nv50_program.c revision 81de711fc864247419221d700bd045addf22cb52
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
/* Number of hardware temporary slots tracked in nv50_pc::r_temp. */
#define NV50_SU_MAX_TEMP 64
/* Disabled by default; remove the leading slashes to define
 * NV50_PROGRAM_DUMP (its users are not in this part of the file). */
//#define NV50_PROGRAM_DUMP
36
37/* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
 * 	- Drop it as a separate op; instead introduce a way to negate args
 * 	  for ops that support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
 * WARNING: watch dst==src vectors, a write can overwrite components that
 * 	are still needed, e.g. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * 	FP attr/result assignment - how?
58 * 		attrib
59 * 			- 0x16bc maps vp output onto fp hpos
60 * 			- 0x16c0 maps vp output onto fp col0
61 * 		result
62 * 			- colr always 0-3
63 * 			- depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * 	      "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * 	- 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * 	- XX == FP high something
75 */
/* One scalar register as tracked by the code generator. */
struct nv50_reg {
	/* register file this entry lives in */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;
	/* -1 for registers created by alloc_temp()/alloc_temp4()/alloc_immd() */
	int index;

	/* hardware index; -1 while a P_TEMP has no slot assigned yet */
	int hw;
	/* non-zero: the value is to be negated when used as a source */
	int neg;

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
92
93struct nv50_pc {
94	struct nv50_program *p;
95
96	/* hw resources */
97	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
98
99	/* tgsi resources */
100	struct nv50_reg *temp;
101	int temp_nr;
102	struct nv50_reg *attr;
103	int attr_nr;
104	struct nv50_reg *result;
105	int result_nr;
106	struct nv50_reg *param;
107	int param_nr;
108	struct nv50_reg *immd;
109	float *immd_buf;
110	int immd_nr;
111
112	struct nv50_reg *temp_temp[16];
113	unsigned temp_temp_nr;
114
115	/* broadcast and destination replacement regs */
116	struct nv50_reg *r_brdc;
117	struct nv50_reg *r_dst[4];
118
119	unsigned interp_mode[32];
120	/* perspective interpolation registers */
121	struct nv50_reg *iv_p;
122	struct nv50_reg *iv_c;
123
124	/* current instruction and total number of insns */
125	unsigned insn_cur;
126	unsigned insn_nr;
127
128	boolean allow32;
129};
130
131static void
132alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
133{
134	int i = 0;
135
136	if (reg->type == P_RESULT) {
137		if (pc->p->cfg.high_result < (reg->hw + 1))
138			pc->p->cfg.high_result = reg->hw + 1;
139	}
140
141	if (reg->type != P_TEMP)
142		return;
143
144	if (reg->hw >= 0) {
145		/*XXX: do this here too to catch FP temp-as-attr usage..
146		 *     not clean, but works */
147		if (pc->p->cfg.high_temp < (reg->hw + 1))
148			pc->p->cfg.high_temp = reg->hw + 1;
149		return;
150	}
151
152	if (reg->rhw != -1) {
153		/* try to allocate temporary with index rhw first */
154		if (!(pc->r_temp[reg->rhw])) {
155			pc->r_temp[reg->rhw] = reg;
156			reg->hw = reg->rhw;
157			if (pc->p->cfg.high_temp < (reg->rhw + 1))
158				pc->p->cfg.high_temp = reg->rhw + 1;
159			return;
160		}
161		/* make sure we don't get things like $r0 needs to go
162		 * in $r1 and $r1 in $r0
163		 */
164		i = pc->result_nr * 4;
165	}
166
167	for (; i < NV50_SU_MAX_TEMP; i++) {
168		if (!(pc->r_temp[i])) {
169			pc->r_temp[i] = reg;
170			reg->hw = i;
171			if (pc->p->cfg.high_temp < (i + 1))
172				pc->p->cfg.high_temp = i + 1;
173			return;
174		}
175	}
176
177	assert(0);
178}
179
180static struct nv50_reg *
181alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
182{
183	struct nv50_reg *r;
184	int i;
185
186	if (dst && dst->type == P_TEMP && dst->hw == -1)
187		return dst;
188
189	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
190		if (!pc->r_temp[i]) {
191			r = CALLOC_STRUCT(nv50_reg);
192			r->type = P_TEMP;
193			r->index = -1;
194			r->hw = i;
195			r->rhw = -1;
196			pc->r_temp[i] = r;
197			return r;
198		}
199	}
200
201	assert(0);
202	return NULL;
203}
204
205/* Assign the hw of the discarded temporary register src
206 * to the tgsi register dst and free src.
207 */
208static void
209assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
210{
211	assert(src->index == -1 && src->hw != -1);
212
213	if (dst->hw != -1)
214		pc->r_temp[dst->hw] = NULL;
215	pc->r_temp[src->hw] = dst;
216	dst->hw = src->hw;
217
218	FREE(src);
219}
220
221/* release the hardware resource held by r */
222static void
223release_hw(struct nv50_pc *pc, struct nv50_reg *r)
224{
225	assert(r->type == P_TEMP);
226	if (r->hw == -1)
227		return;
228
229	assert(pc->r_temp[r->hw] == r);
230	pc->r_temp[r->hw] = NULL;
231
232	r->acc = 0;
233	if (r->index == -1)
234		FREE(r);
235}
236
237static void
238free_temp(struct nv50_pc *pc, struct nv50_reg *r)
239{
240	if (r->index == -1) {
241		unsigned hw = r->hw;
242
243		FREE(pc->r_temp[hw]);
244		pc->r_temp[hw] = NULL;
245	}
246}
247
248static int
249alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
250{
251	int i;
252
253	if ((idx + 4) >= NV50_SU_MAX_TEMP)
254		return 1;
255
256	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
257	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
258		return alloc_temp4(pc, dst, idx + 4);
259
260	for (i = 0; i < 4; i++) {
261		dst[i] = CALLOC_STRUCT(nv50_reg);
262		dst[i]->type = P_TEMP;
263		dst[i]->index = -1;
264		dst[i]->hw = idx + i;
265		pc->r_temp[idx + i] = dst[i];
266	}
267
268	return 0;
269}
270
/* Release the four registers of reg[] through free_temp(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	int n = 0;

	while (n < 4) {
		free_temp(pc, reg[n]);
		n++;
	}
}
279
280static struct nv50_reg *
281temp_temp(struct nv50_pc *pc)
282{
283	if (pc->temp_temp_nr >= 16)
284		assert(0);
285
286	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
287	return pc->temp_temp[pc->temp_temp_nr++];
288}
289
290static void
291kill_temp_temp(struct nv50_pc *pc)
292{
293	int i;
294
295	for (i = 0; i < pc->temp_temp_nr; i++)
296		free_temp(pc, pc->temp_temp[i]);
297	pc->temp_temp_nr = 0;
298}
299
300static int
301ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
302{
303	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
304			       (pc->immd_nr + 1) * 4 * sizeof(float));
305	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
306	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
307	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
308	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
309
310	return pc->immd_nr++;
311}
312
313static struct nv50_reg *
314alloc_immd(struct nv50_pc *pc, float f)
315{
316	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
317	unsigned hw;
318
319	for (hw = 0; hw < pc->immd_nr * 4; hw++)
320		if (pc->immd_buf[hw] == f)
321			break;
322
323	if (hw == pc->immd_nr * 4)
324		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
325
326	r->type = P_IMMD;
327	r->hw = hw;
328	r->index = -1;
329	return r;
330}
331
332static struct nv50_program_exec *
333exec(struct nv50_pc *pc)
334{
335	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
336
337	e->param.index = -1;
338	return e;
339}
340
341static void
342emit(struct nv50_pc *pc, struct nv50_program_exec *e)
343{
344	struct nv50_program *p = pc->p;
345
346	if (p->exec_tail)
347		p->exec_tail->next = e;
348	if (!p->exec_head)
349		p->exec_head = e;
350	p->exec_tail = e;
351	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
352}
353
354static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
355
356static boolean
357is_long(struct nv50_program_exec *e)
358{
359	if (e->inst[0] & 1)
360		return TRUE;
361	return FALSE;
362}
363
364static boolean
365is_immd(struct nv50_program_exec *e)
366{
367	if (is_long(e) && (e->inst[1] & 3) == 3)
368		return TRUE;
369	return FALSE;
370}
371
372static INLINE void
373set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
374	 struct nv50_program_exec *e)
375{
376	set_long(pc, e);
377	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
378	e->inst[1] |= (pred << 7) | (idx << 12);
379}
380
381static INLINE void
382set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
383	    struct nv50_program_exec *e)
384{
385	set_long(pc, e);
386	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
387	e->inst[1] |= (idx << 4) | (on << 6);
388}
389
390static INLINE void
391set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
392{
393	if (is_long(e))
394		return;
395
396	e->inst[0] |= 1;
397	set_pred(pc, 0xf, 0, e);
398	set_pred_wr(pc, 0, 0, e);
399}
400
401static INLINE void
402set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
403{
404	if (dst->type == P_RESULT) {
405		set_long(pc, e);
406		e->inst[1] |= 0x00000008;
407	}
408
409	alloc_reg(pc, dst);
410	e->inst[0] |= (dst->hw << 2);
411}
412
413static INLINE void
414set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
415{
416	float f = pc->immd_buf[imm->hw];
417	unsigned val = fui(imm->neg ? -f : f);
418
419	set_long(pc, e);
420	/*XXX: can't be predicated - bits overlap.. catch cases where both
421	 *     are required and avoid them. */
422	set_pred(pc, 0, 0, e);
423	set_pred_wr(pc, 0, 0, e);
424
425	e->inst[1] |= 0x00000002 | 0x00000001;
426	e->inst[0] |= (val & 0x3f) << 16;
427	e->inst[1] |= (val >> 6) << 2;
428}
429
430
/* Interpolation mode flags accepted by emit_interp(); FLAT, PERSPECTIVE
 * and CENTROID are tested there as individual bits. */
#define INTERP_LINEAR		0
#define INTERP_FLAT		1
#define INTERP_PERSPECTIVE	2
#define INTERP_CENTROID		4
435
436/* interpolant index has been stored in dst->rhw */
437static void
438emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
439		unsigned mode)
440{
441	assert(dst->rhw != -1);
442	struct nv50_program_exec *e = exec(pc);
443
444	e->inst[0] |= 0x80000000;
445	set_dst(pc, dst, e);
446	e->inst[0] |= (dst->rhw << 16);
447
448	if (mode & INTERP_FLAT) {
449		e->inst[0] |= (1 << 8);
450	} else {
451		if (mode & INTERP_PERSPECTIVE) {
452			e->inst[0] |= (1 << 25);
453			alloc_reg(pc, iv);
454			e->inst[0] |= (iv->hw << 9);
455		}
456
457		if (mode & INTERP_CENTROID)
458			e->inst[0] |= (1 << 24);
459	}
460
461	emit(pc, e);
462}
463
464static void
465set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
466	 struct nv50_program_exec *e)
467{
468	set_long(pc, e);
469
470	e->param.index = src->hw;
471	e->param.shift = s;
472	e->param.mask = m << (s % 32);
473
474	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
475}
476
477static void
478emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
479{
480	struct nv50_program_exec *e = exec(pc);
481
482	e->inst[0] |= 0x10000000;
483
484	set_dst(pc, dst, e);
485
486	if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
487		set_immd(pc, src, e);
488		/*XXX: 32-bit, but steals part of "half" reg space - need to
489		 *     catch and handle this case if/when we do half-regs
490		 */
491	} else
492	if (src->type == P_IMMD || src->type == P_CONST) {
493		set_long(pc, e);
494		set_data(pc, src, 0x7f, 9, e);
495		e->inst[1] |= 0x20000000; /* src0 const? */
496	} else {
497		if (src->type == P_ATTR) {
498			set_long(pc, e);
499			e->inst[1] |= 0x00200000;
500		}
501
502		alloc_reg(pc, src);
503		e->inst[0] |= (src->hw << 9);
504	}
505
506	if (is_long(e) && !is_immd(e)) {
507		e->inst[1] |= 0x04000000; /* 32-bit */
508		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
509		if (!(e->inst[1] & 0x20000000))
510			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
511	} else
512		e->inst[0] |= 0x00008000;
513
514	emit(pc, e);
515}
516
517static INLINE void
518emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
519{
520	struct nv50_reg *imm = alloc_immd(pc, f);
521	emit_mov(pc, dst, imm);
522	FREE(imm);
523}
524
525static boolean
526check_swap_src_0_1(struct nv50_pc *pc,
527		   struct nv50_reg **s0, struct nv50_reg **s1)
528{
529	struct nv50_reg *src0 = *s0, *src1 = *s1;
530
531	if (src0->type == P_CONST) {
532		if (src1->type != P_CONST) {
533			*s0 = src1;
534			*s1 = src0;
535			return TRUE;
536		}
537	} else
538	if (src1->type == P_ATTR) {
539		if (src0->type != P_ATTR) {
540			*s0 = src1;
541			*s1 = src0;
542			return TRUE;
543		}
544	}
545
546	return FALSE;
547}
548
549static void
550set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
551{
552	if (src->type == P_ATTR) {
553		set_long(pc, e);
554		e->inst[1] |= 0x00200000;
555	} else
556	if (src->type == P_CONST || src->type == P_IMMD) {
557		struct nv50_reg *temp = temp_temp(pc);
558
559		emit_mov(pc, temp, src);
560		src = temp;
561	}
562
563	alloc_reg(pc, src);
564	e->inst[0] |= (src->hw << 9);
565}
566
567static void
568set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
569{
570	if (src->type == P_ATTR) {
571		struct nv50_reg *temp = temp_temp(pc);
572
573		emit_mov(pc, temp, src);
574		src = temp;
575	} else
576	if (src->type == P_CONST || src->type == P_IMMD) {
577		assert(!(e->inst[0] & 0x00800000));
578		if (e->inst[0] & 0x01000000) {
579			struct nv50_reg *temp = temp_temp(pc);
580
581			emit_mov(pc, temp, src);
582			src = temp;
583		} else {
584			set_data(pc, src, 0x7f, 16, e);
585			e->inst[0] |= 0x00800000;
586		}
587	}
588
589	alloc_reg(pc, src);
590	e->inst[0] |= (src->hw << 16);
591}
592
593static void
594set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
595{
596	set_long(pc, e);
597
598	if (src->type == P_ATTR) {
599		struct nv50_reg *temp = temp_temp(pc);
600
601		emit_mov(pc, temp, src);
602		src = temp;
603	} else
604	if (src->type == P_CONST || src->type == P_IMMD) {
605		assert(!(e->inst[0] & 0x01000000));
606		if (e->inst[0] & 0x00800000) {
607			struct nv50_reg *temp = temp_temp(pc);
608
609			emit_mov(pc, temp, src);
610			src = temp;
611		} else {
612			set_data(pc, src, 0x7f, 32+14, e);
613			e->inst[0] |= 0x01000000;
614		}
615	}
616
617	alloc_reg(pc, src);
618	e->inst[1] |= (src->hw << 14);
619}
620
621static void
622emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
623	 struct nv50_reg *src1)
624{
625	struct nv50_program_exec *e = exec(pc);
626
627	e->inst[0] |= 0xc0000000;
628
629	if (!pc->allow32)
630		set_long(pc, e);
631
632	check_swap_src_0_1(pc, &src0, &src1);
633	set_dst(pc, dst, e);
634	set_src_0(pc, src0, e);
635	if (src1->type == P_IMMD && !is_long(e)) {
636		if (src0->neg)
637			e->inst[0] |= 0x00008000;
638		set_immd(pc, src1, e);
639	} else {
640		set_src_1(pc, src1, e);
641		if (src0->neg ^ src1->neg) {
642			if (is_long(e))
643				e->inst[1] |= 0x08000000;
644			else
645				e->inst[0] |= 0x00008000;
646		}
647	}
648
649	emit(pc, e);
650}
651
652static void
653emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
654	 struct nv50_reg *src0, struct nv50_reg *src1)
655{
656	struct nv50_program_exec *e = exec(pc);
657
658	e->inst[0] |= 0xb0000000;
659
660	check_swap_src_0_1(pc, &src0, &src1);
661
662	if (!pc->allow32 || src0->neg || src1->neg) {
663		set_long(pc, e);
664		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
665	}
666
667	set_dst(pc, dst, e);
668	set_src_0(pc, src0, e);
669	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
670		set_src_2(pc, src1, e);
671	else
672	if (src1->type == P_IMMD)
673		set_immd(pc, src1, e);
674	else
675		set_src_1(pc, src1, e);
676
677	emit(pc, e);
678}
679
680static void
681emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
682	    struct nv50_reg *src0, struct nv50_reg *src1)
683{
684	struct nv50_program_exec *e = exec(pc);
685
686	set_long(pc, e);
687	e->inst[0] |= 0xb0000000;
688	e->inst[1] |= (sub << 29);
689
690	check_swap_src_0_1(pc, &src0, &src1);
691	set_dst(pc, dst, e);
692	set_src_0(pc, src0, e);
693	set_src_1(pc, src1, e);
694
695	emit(pc, e);
696}
697
698static INLINE void
699emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
700	 struct nv50_reg *src1)
701{
702	src1->neg ^= 1;
703	emit_add(pc, dst, src0, src1);
704	src1->neg ^= 1;
705}
706
707static void
708emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
709	 struct nv50_reg *src1, struct nv50_reg *src2)
710{
711	struct nv50_program_exec *e = exec(pc);
712
713	e->inst[0] |= 0xe0000000;
714
715	check_swap_src_0_1(pc, &src0, &src1);
716	set_dst(pc, dst, e);
717	set_src_0(pc, src0, e);
718	set_src_1(pc, src1, e);
719	set_src_2(pc, src2, e);
720
721	if (src0->neg ^ src1->neg)
722		e->inst[1] |= 0x04000000;
723	if (src2->neg)
724		e->inst[1] |= 0x08000000;
725
726	emit(pc, e);
727}
728
729static INLINE void
730emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
731	 struct nv50_reg *src1, struct nv50_reg *src2)
732{
733	src2->neg ^= 1;
734	emit_mad(pc, dst, src0, src1, src2);
735	src2->neg ^= 1;
736}
737
738static void
739emit_flop(struct nv50_pc *pc, unsigned sub,
740	  struct nv50_reg *dst, struct nv50_reg *src)
741{
742	struct nv50_program_exec *e = exec(pc);
743
744	e->inst[0] |= 0x90000000;
745	if (sub) {
746		set_long(pc, e);
747		e->inst[1] |= (sub << 29);
748	}
749
750	set_dst(pc, dst, e);
751	set_src_0(pc, src, e);
752
753	emit(pc, e);
754}
755
756static void
757emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
758{
759	struct nv50_program_exec *e = exec(pc);
760
761	e->inst[0] |= 0xb0000000;
762
763	set_dst(pc, dst, e);
764	set_src_0(pc, src, e);
765	set_long(pc, e);
766	e->inst[1] |= (6 << 29) | 0x00004000;
767
768	emit(pc, e);
769}
770
771static void
772emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
773{
774	struct nv50_program_exec *e = exec(pc);
775
776	e->inst[0] |= 0xb0000000;
777
778	set_dst(pc, dst, e);
779	set_src_0(pc, src, e);
780	set_long(pc, e);
781	e->inst[1] |= (6 << 29);
782
783	emit(pc, e);
784}
785
/* "cop" argument of emit_cvt(), stored at bit 16 of inst[1] */
#define CVTOP_RN	0x01
#define CVTOP_FLOOR	0x03
#define CVTOP_CEIL	0x05
#define CVTOP_TRUNC	0x07
#define CVTOP_SAT	0x08
#define CVTOP_ABS	0x10

/* "fmt" argument of emit_cvt(), stored at bit 24 of inst[1] */
#define CVT_F32_F32 0xc4
#define CVT_F32_S32 0x44
#define CVT_F32_U32 0x64
#define CVT_S32_F32 0x8c
#define CVT_S32_S32 0x0c
#define CVT_F32_F32_ROP 0xcc
799
800static void
801emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
802	 int wp, unsigned cop, unsigned fmt)
803{
804	struct nv50_program_exec *e;
805
806	e = exec(pc);
807	set_long(pc, e);
808
809	e->inst[0] |= 0xa0000000;
810	e->inst[1] |= 0x00004000;
811	e->inst[1] |= (cop << 16);
812	e->inst[1] |= (fmt << 24);
813	set_src_0(pc, src, e);
814
815	if (wp >= 0)
816		set_pred_wr(pc, 1, wp, e);
817
818	if (dst)
819		set_dst(pc, dst, e);
820	else {
821		e->inst[0] |= 0x000001fc;
822		e->inst[1] |= 0x00000008;
823	}
824
825	emit(pc, e);
826}
827
828static void
829emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
830	 struct nv50_reg *src0, struct nv50_reg *src1)
831{
832	struct nv50_program_exec *e = exec(pc);
833	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
834	struct nv50_reg *rdst;
835
836	assert(c_op <= 7);
837	if (check_swap_src_0_1(pc, &src0, &src1))
838		c_op = inv_cop[c_op];
839
840	rdst = dst;
841	if (dst->type != P_TEMP)
842		dst = alloc_temp(pc, NULL);
843
844	/* set.u32 */
845	set_long(pc, e);
846	e->inst[0] |= 0xb0000000;
847	e->inst[1] |= (3 << 29);
848	e->inst[1] |= (c_op << 14);
849	/*XXX: breaks things, .u32 by default?
850	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
851	 *     doesn't seem to match what the hw actually does.
852	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
853	 */
854	set_dst(pc, dst, e);
855	set_src_0(pc, src0, e);
856	set_src_1(pc, src1, e);
857	emit(pc, e);
858
859	/* cvt.f32.u32 */
860	e = exec(pc);
861	e->inst[0] = 0xa0000001;
862	e->inst[1] = 0x64014780;
863	set_dst(pc, rdst, e);
864	set_src_0(pc, dst, e);
865	emit(pc, e);
866
867	if (dst != rdst)
868		free_temp(pc, dst);
869}
870
871static INLINE void
872emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
873{
874	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
875}
876
/* Emit dst = pow(v, e) as a four-instruction sequence working in a private
 * temporary: flop 3 on v, multiply by e, emit_preex2(), flop 6 into dst.
 * (Presumably lg2 / ex2, i.e. 2^(e * log2(v)) - confirm against the ISA.)
 */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *acc = alloc_temp(pc, NULL);

	emit_flop(pc, 3, acc, v);
	emit_mul(pc, acc, acc, e);
	emit_preex2(pc, acc, acc);
	emit_flop(pc, 6, dst, acc);

	free_temp(pc, acc);
}
890
891static INLINE void
892emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
893{
894	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
895}
896
897static INLINE void
898emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
899{
900	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
901}
902
903static void
904emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
905	 struct nv50_reg **src)
906{
907	struct nv50_reg *one = alloc_immd(pc, 1.0);
908	struct nv50_reg *zero = alloc_immd(pc, 0.0);
909	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
910	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
911	struct nv50_reg *tmp[4];
912	boolean allow32 = pc->allow32;
913
914	pc->allow32 = FALSE;
915
916	if (mask & (3 << 1)) {
917		tmp[0] = alloc_temp(pc, NULL);
918		emit_minmax(pc, 4, tmp[0], src[0], zero);
919	}
920
921	if (mask & (1 << 2)) {
922		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
923
924		tmp[1] = temp_temp(pc);
925		emit_minmax(pc, 4, tmp[1], src[1], zero);
926
927		tmp[3] = temp_temp(pc);
928		emit_minmax(pc, 4, tmp[3], src[3], neg128);
929		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
930
931		emit_pow(pc, dst[2], tmp[1], tmp[3]);
932		emit_mov(pc, dst[2], zero);
933		set_pred(pc, 3, 0, pc->p->exec_tail);
934	}
935
936	if (mask & (1 << 1))
937		assimilate_temp(pc, dst[1], tmp[0]);
938	else
939	if (mask & (1 << 2))
940		free_temp(pc, tmp[0]);
941
942	pc->allow32 = allow32;
943
944	/* do this last, in case src[i,j] == dst[0,3] */
945	if (mask & (1 << 0))
946		emit_mov(pc, dst[0], one);
947
948	if (mask & (1 << 3))
949		emit_mov(pc, dst[3], one);
950
951	FREE(pos128);
952	FREE(neg128);
953	FREE(zero);
954	FREE(one);
955}
956
957static void
958emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
959{
960	struct nv50_program_exec *e = exec(pc);
961
962	set_long(pc, e);
963	e->inst[0] |= 0xa0000000; /* delta */
964	e->inst[1] |= (7 << 29); /* delta */
965	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
966	e->inst[1] |= (1 << 14); /* src .f32 */
967	set_dst(pc, dst, e);
968	set_src_0(pc, src, e);
969
970	emit(pc, e);
971}
972
973static void
974emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
975{
976	struct nv50_program_exec *e;
977	const int r_pred = 1;
978
979	/* Sets predicate reg ? */
980	e = exec(pc);
981	e->inst[0] = 0xa00001fd;
982	e->inst[1] = 0xc4014788;
983	set_src_0(pc, src, e);
984	set_pred_wr(pc, 1, r_pred, e);
985	if (src->neg)
986		e->inst[1] |= 0x20000000;
987	emit(pc, e);
988
989	/* This is probably KILP */
990	e = exec(pc);
991	e->inst[0] = 0x000001fe;
992	set_long(pc, e);
993	set_pred(pc, 1 /* LT? */, r_pred, e);
994	emit(pc, e);
995}
996
997static void
998emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
999	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
1000{
1001	struct nv50_reg *temp, *t[4];
1002	struct nv50_program_exec *e;
1003
1004	unsigned c, mode, dim;
1005
1006	switch (type) {
1007	case TGSI_TEXTURE_1D:
1008		dim = 1;
1009		break;
1010	case TGSI_TEXTURE_UNKNOWN:
1011	case TGSI_TEXTURE_2D:
1012	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1013	case TGSI_TEXTURE_RECT:
1014		dim = 2;
1015		break;
1016	case TGSI_TEXTURE_3D:
1017	case TGSI_TEXTURE_CUBE:
1018	case TGSI_TEXTURE_SHADOW2D:
1019	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1020		dim = 3;
1021		break;
1022	default:
1023		assert(0);
1024		break;
1025	}
1026
1027	/* some cards need t[0]'s hw index to be a multiple of 4 */
1028	alloc_temp4(pc, t, 0);
1029
1030	if (proj) {
1031		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1032			mode = pc->interp_mode[src[0]->index];
1033
1034			t[3]->rhw = src[3]->rhw;
1035			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1036			emit_flop(pc, 0, t[3], t[3]);
1037
1038			for (c = 0; c < dim; c++) {
1039				t[c]->rhw = src[c]->rhw;
1040				emit_interp(pc, t[c], t[3],
1041					    (mode | INTERP_PERSPECTIVE));
1042			}
1043		} else {
1044			emit_flop(pc, 0, t[3], src[3]);
1045			for (c = 0; c < dim; c++)
1046				emit_mul(pc, t[c], src[c], t[3]);
1047
1048			/* XXX: for some reason the blob sometimes uses MAD:
1049			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1050			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1051			 */
1052		}
1053	} else {
1054		if (type == TGSI_TEXTURE_CUBE) {
1055			temp = temp_temp(pc);
1056			emit_minmax(pc, 4, temp, src[0], src[1]);
1057			emit_minmax(pc, 4, temp, temp, src[2]);
1058			emit_flop(pc, 0, temp, temp);
1059			for (c = 0; c < 3; c++)
1060				emit_mul(pc, t[c], src[c], temp);
1061		} else {
1062			for (c = 0; c < dim; c++)
1063				emit_mov(pc, t[c], src[c]);
1064		}
1065	}
1066
1067	e = exec(pc);
1068	set_long(pc, e);
1069	e->inst[0] |= 0xf0000000;
1070	e->inst[1] |= 0x00000004;
1071	set_dst(pc, t[0], e);
1072	e->inst[0] |= (unit << 9);
1073
1074	if (dim == 2)
1075		e->inst[0] |= 0x00400000;
1076	else
1077	if (dim == 3)
1078		e->inst[0] |= 0x00800000;
1079
1080	e->inst[0] |= (mask & 0x3) << 25;
1081	e->inst[1] |= (mask & 0xc) << 12;
1082
1083	emit(pc, e);
1084
1085#if 1
1086	if (mask & 1) emit_mov(pc, dst[0], t[0]);
1087	if (mask & 2) emit_mov(pc, dst[1], t[1]);
1088	if (mask & 4) emit_mov(pc, dst[2], t[2]);
1089	if (mask & 8) emit_mov(pc, dst[3], t[3]);
1090
1091	free_temp4(pc, t);
1092#else
1093	/* XXX: if p.e. MUL is used directly after TEX, it would still use
1094	 * the texture coordinates, not the fetched values: latency ? */
1095
1096	for (c = 0; c < 4; c++) {
1097		if (mask & (1 << c))
1098			assimilate_temp(pc, dst[c], t[c]);
1099		else
1100			free_temp(pc, t[c]);
1101	}
1102#endif
1103}
1104
1105static void
1106convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1107{
1108	unsigned q = 0, m = ~0;
1109
1110	assert(!is_long(e));
1111
1112	switch (e->inst[0] >> 28) {
1113	case 0x1:
1114		/* MOV */
1115		q = 0x0403c000;
1116		m = 0xffff7fff;
1117		break;
1118	case 0x8:
1119		/* INTERP (move centroid, perspective and flat bits) */
1120		m = ~0x03000100;
1121		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1122		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1123		break;
1124	case 0x9:
1125		/* RCP */
1126		break;
1127	case 0xB:
1128		/* ADD */
1129		m = ~(127 << 16);
1130		q = ((e->inst[0] & (~m)) >> 2);
1131		break;
1132	case 0xC:
1133		/* MUL */
1134		m = ~0x00008000;
1135		q = ((e->inst[0] & (~m)) << 12);
1136		break;
1137	case 0xE:
1138		/* MAD (if src2 == dst) */
1139		q = ((e->inst[0] & 0x1fc) << 12);
1140		break;
1141	default:
1142		assert(0);
1143		break;
1144	}
1145
1146	set_long(pc, e);
1147	pc->p->exec_size++;
1148
1149	e->inst[0] &= m;
1150	e->inst[1] |= q;
1151}
1152
1153static boolean
1154negate_supported(const struct tgsi_full_instruction *insn, int i)
1155{
1156	switch (insn->Instruction.Opcode) {
1157	case TGSI_OPCODE_DP3:
1158	case TGSI_OPCODE_DP4:
1159	case TGSI_OPCODE_MUL:
1160	case TGSI_OPCODE_KIL:
1161	case TGSI_OPCODE_ADD:
1162	case TGSI_OPCODE_SUB:
1163	case TGSI_OPCODE_MAD:
1164		return TRUE;
1165	case TGSI_OPCODE_POW:
1166		return (i == 1) ? TRUE : FALSE;
1167	default:
1168		return FALSE;
1169	}
1170}
1171
1172/* Return a read mask for source registers deduced from opcode & write mask. */
1173static unsigned
1174nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
1175{
1176	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
1177
1178	switch (insn->Instruction.Opcode) {
1179	case TGSI_OPCODE_COS:
1180	case TGSI_OPCODE_SIN:
1181		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
1182	case TGSI_OPCODE_DP3:
1183		return 0x7;
1184	case TGSI_OPCODE_DP4:
1185	case TGSI_OPCODE_DPH:
1186	case TGSI_OPCODE_KIL: /* WriteMask ignored */
1187		return 0xf;
1188	case TGSI_OPCODE_DST:
1189		return mask & (c ? 0xa : 0x6);
1190	case TGSI_OPCODE_EX2:
1191	case TGSI_OPCODE_LG2:
1192	case TGSI_OPCODE_POW:
1193	case TGSI_OPCODE_RCP:
1194	case TGSI_OPCODE_RSQ:
1195	case TGSI_OPCODE_SCS:
1196		return 0x1;
1197	case TGSI_OPCODE_LIT:
1198		return 0xb;
1199	case TGSI_OPCODE_TEX:
1200	case TGSI_OPCODE_TXP:
1201	{
1202		const struct tgsi_instruction_ext_texture *tex;
1203
1204		assert(insn->Instruction.Extended);
1205		tex = &insn->InstructionExtTexture;
1206
1207		mask = 0x7;
1208		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1209			mask |= 0x8;
1210
1211		switch (tex->Texture) {
1212		case TGSI_TEXTURE_1D:
1213			mask &= 0x9;
1214			break;
1215		case TGSI_TEXTURE_2D:
1216			mask &= 0xb;
1217			break;
1218		default:
1219			break;
1220		}
1221	}
1222		return mask;
1223	case TGSI_OPCODE_XPD:
1224		x = 0;
1225		if (mask & 1) x |= 0x6;
1226		if (mask & 2) x |= 0x5;
1227		if (mask & 4) x |= 0x3;
1228		return x;
1229	default:
1230		break;
1231	}
1232
1233	return mask;
1234}
1235
1236static struct nv50_reg *
1237tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1238{
1239	switch (dst->DstRegister.File) {
1240	case TGSI_FILE_TEMPORARY:
1241		return &pc->temp[dst->DstRegister.Index * 4 + c];
1242	case TGSI_FILE_OUTPUT:
1243		return &pc->result[dst->DstRegister.Index * 4 + c];
1244	case TGSI_FILE_NULL:
1245		return NULL;
1246	default:
1247		break;
1248	}
1249
1250	return NULL;
1251}
1252
1253static struct nv50_reg *
1254tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1255	 boolean neg)
1256{
1257	struct nv50_reg *r = NULL;
1258	struct nv50_reg *temp;
1259	unsigned sgn, c;
1260
1261	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1262
1263	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1264	switch (c) {
1265	case TGSI_EXTSWIZZLE_X:
1266	case TGSI_EXTSWIZZLE_Y:
1267	case TGSI_EXTSWIZZLE_Z:
1268	case TGSI_EXTSWIZZLE_W:
1269		switch (src->SrcRegister.File) {
1270		case TGSI_FILE_INPUT:
1271			r = &pc->attr[src->SrcRegister.Index * 4 + c];
1272			break;
1273		case TGSI_FILE_TEMPORARY:
1274			r = &pc->temp[src->SrcRegister.Index * 4 + c];
1275			break;
1276		case TGSI_FILE_CONSTANT:
1277			r = &pc->param[src->SrcRegister.Index * 4 + c];
1278			break;
1279		case TGSI_FILE_IMMEDIATE:
1280			r = &pc->immd[src->SrcRegister.Index * 4 + c];
1281			break;
1282		case TGSI_FILE_SAMPLER:
1283			break;
1284		default:
1285			assert(0);
1286			break;
1287		}
1288		break;
1289	case TGSI_EXTSWIZZLE_ZERO:
1290		r = alloc_immd(pc, 0.0);
1291		return r;
1292	case TGSI_EXTSWIZZLE_ONE:
1293		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1294			return alloc_immd(pc, -1.0);
1295		return alloc_immd(pc, 1.0);
1296	default:
1297		assert(0);
1298		break;
1299	}
1300
1301	switch (sgn) {
1302	case TGSI_UTIL_SIGN_KEEP:
1303		break;
1304	case TGSI_UTIL_SIGN_CLEAR:
1305		temp = temp_temp(pc);
1306		emit_abs(pc, temp, r);
1307		r = temp;
1308		break;
1309	case TGSI_UTIL_SIGN_TOGGLE:
1310		if (neg)
1311			r->neg = 1;
1312		else {
1313			temp = temp_temp(pc);
1314			emit_neg(pc, temp, r);
1315			r = temp;
1316		}
1317		break;
1318	case TGSI_UTIL_SIGN_SET:
1319		temp = temp_temp(pc);
1320		emit_abs(pc, temp, r);
1321		if (neg)
1322			temp->neg = 1;
1323		else
1324			emit_neg(pc, temp, temp);
1325		r = temp;
1326		break;
1327	default:
1328		assert(0);
1329		break;
1330	}
1331
1332	return r;
1333}
1334
1335/* return TRUE for ops that produce only a single result */
1336static boolean
1337is_scalar_op(unsigned op)
1338{
1339	switch (op) {
1340	case TGSI_OPCODE_DP2:
1341	case TGSI_OPCODE_DP3:
1342	case TGSI_OPCODE_DP4:
1343	case TGSI_OPCODE_DPH:
1344	case TGSI_OPCODE_EX2:
1345	case TGSI_OPCODE_LG2:
1346	case TGSI_OPCODE_POW:
1347	case TGSI_OPCODE_RCP:
1348	case TGSI_OPCODE_RSQ:
1349		/*
1350	case TGSI_OPCODE_COS:
1351	case TGSI_OPCODE_KIL:
1352	case TGSI_OPCODE_LIT:
1353	case TGSI_OPCODE_SCS:
1354	case TGSI_OPCODE_SIN:
1355		*/
1356		return TRUE;
1357	default:
1358		return FALSE;
1359	}
1360}
1361
1362/* Returns a bitmask indicating which dst components depend
1363 * on source s, component c (reverse of nv50_tgsi_src_mask).
1364 */
1365static unsigned
1366nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1367{
1368	if (is_scalar_op(op))
1369		return 0x1;
1370
1371	switch (op) {
1372	case TGSI_OPCODE_DST:
1373		return (1 << c) & (s ? 0xa : 0x6);
1374	case TGSI_OPCODE_XPD:
1375		switch (c) {
1376		case 0: return 0x6;
1377		case 1: return 0x5;
1378		case 2: return 0x3;
1379		case 3: return 0x0;
1380		default:
1381			assert(0);
1382			return 0x0;
1383		}
1384	case TGSI_OPCODE_LIT:
1385	case TGSI_OPCODE_SCS:
1386	case TGSI_OPCODE_TEX:
1387	case TGSI_OPCODE_TXP:
1388		/* these take care of dangerous swizzles themselves */
1389		return 0x0;
1390	case TGSI_OPCODE_IF:
1391	case TGSI_OPCODE_KIL:
1392		/* don't call this function for these ops */
1393		assert(0);
1394		return 0;
1395	default:
1396		/* linear vector instruction */
1397		return (1 << c);
1398	}
1399}
1400
1401static boolean
1402nv50_program_tx_insn(struct nv50_pc *pc,
1403		     const struct tgsi_full_instruction *inst)
1404{
1405	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
1406	unsigned mask, sat, unit;
1407	int i, c;
1408
1409	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1410	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1411
1412	memset(src, 0, sizeof(src));
1413
1414	for (c = 0; c < 4; c++) {
1415		if ((mask & (1 << c)) && !pc->r_dst[c])
1416			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1417		else
1418			dst[c] = pc->r_dst[c];
1419		rdst[c] = dst[c];
1420	}
1421
1422	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1423		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1424		unsigned src_mask;
1425		boolean neg_supp;
1426
1427		src_mask = nv50_tgsi_src_mask(inst, i);
1428		neg_supp = negate_supported(inst, i);
1429
1430		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1431			unit = fs->SrcRegister.Index;
1432
1433		for (c = 0; c < 4; c++)
1434			if (src_mask & (1 << c))
1435				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
1436	}
1437
1438	brdc = temp = pc->r_brdc;
1439	if (brdc && brdc->type != P_TEMP) {
1440		temp = temp_temp(pc);
1441		if (sat)
1442			brdc = temp;
1443	} else
1444	if (sat) {
1445		for (c = 0; c < 4; c++) {
1446			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
1447				continue;
1448			rdst[c] = dst[c];
1449			dst[c] = temp_temp(pc);
1450		}
1451	}
1452
1453	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
1454
1455	switch (inst->Instruction.Opcode) {
1456	case TGSI_OPCODE_ABS:
1457		for (c = 0; c < 4; c++) {
1458			if (!(mask & (1 << c)))
1459				continue;
1460			emit_abs(pc, dst[c], src[0][c]);
1461		}
1462		break;
1463	case TGSI_OPCODE_ADD:
1464		for (c = 0; c < 4; c++) {
1465			if (!(mask & (1 << c)))
1466				continue;
1467			emit_add(pc, dst[c], src[0][c], src[1][c]);
1468		}
1469		break;
1470	case TGSI_OPCODE_COS:
1471		temp = temp_temp(pc);
1472		emit_precossin(pc, temp, src[0][0]);
1473		emit_flop(pc, 5, temp, temp);
1474		for (c = 0; c < 4; c++) {
1475			if (!(mask & (1 << c)))
1476				continue;
1477			emit_mov(pc, dst[c], temp);
1478		}
1479		break;
1480	case TGSI_OPCODE_DP3:
1481		emit_mul(pc, temp, src[0][0], src[1][0]);
1482		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1483		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
1484		break;
1485	case TGSI_OPCODE_DP4:
1486		emit_mul(pc, temp, src[0][0], src[1][0]);
1487		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1488		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1489		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
1490		break;
1491	case TGSI_OPCODE_DPH:
1492		emit_mul(pc, temp, src[0][0], src[1][0]);
1493		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1494		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1495		emit_add(pc, brdc, src[1][3], temp);
1496		break;
1497	case TGSI_OPCODE_DST:
1498		if (mask & (1 << 1))
1499			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1500		if (mask & (1 << 2))
1501			emit_mov(pc, dst[2], src[0][2]);
1502		if (mask & (1 << 3))
1503			emit_mov(pc, dst[3], src[1][3]);
1504		if (mask & (1 << 0))
1505			emit_mov_immdval(pc, dst[0], 1.0f);
1506		break;
1507	case TGSI_OPCODE_EX2:
1508		emit_preex2(pc, temp, src[0][0]);
1509		emit_flop(pc, 6, brdc, temp);
1510		break;
1511	case TGSI_OPCODE_FLR:
1512		for (c = 0; c < 4; c++) {
1513			if (!(mask & (1 << c)))
1514				continue;
1515			emit_flr(pc, dst[c], src[0][c]);
1516		}
1517		break;
1518	case TGSI_OPCODE_FRC:
1519		temp = temp_temp(pc);
1520		for (c = 0; c < 4; c++) {
1521			if (!(mask & (1 << c)))
1522				continue;
1523			emit_flr(pc, temp, src[0][c]);
1524			emit_sub(pc, dst[c], src[0][c], temp);
1525		}
1526		break;
1527	case TGSI_OPCODE_KIL:
1528		emit_kil(pc, src[0][0]);
1529		emit_kil(pc, src[0][1]);
1530		emit_kil(pc, src[0][2]);
1531		emit_kil(pc, src[0][3]);
1532		pc->p->cfg.fp.regs[2] |= 0x00100000;
1533		break;
1534	case TGSI_OPCODE_LIT:
1535		emit_lit(pc, &dst[0], mask, &src[0][0]);
1536		break;
1537	case TGSI_OPCODE_LG2:
1538		emit_flop(pc, 3, brdc, src[0][0]);
1539		break;
1540	case TGSI_OPCODE_LRP:
1541		temp = temp_temp(pc);
1542		for (c = 0; c < 4; c++) {
1543			if (!(mask & (1 << c)))
1544				continue;
1545			emit_sub(pc, temp, src[1][c], src[2][c]);
1546			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1547		}
1548		break;
1549	case TGSI_OPCODE_MAD:
1550		for (c = 0; c < 4; c++) {
1551			if (!(mask & (1 << c)))
1552				continue;
1553			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1554		}
1555		break;
1556	case TGSI_OPCODE_MAX:
1557		for (c = 0; c < 4; c++) {
1558			if (!(mask & (1 << c)))
1559				continue;
1560			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1561		}
1562		break;
1563	case TGSI_OPCODE_MIN:
1564		for (c = 0; c < 4; c++) {
1565			if (!(mask & (1 << c)))
1566				continue;
1567			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1568		}
1569		break;
1570	case TGSI_OPCODE_MOV:
1571	case TGSI_OPCODE_SWZ:
1572		for (c = 0; c < 4; c++) {
1573			if (!(mask & (1 << c)))
1574				continue;
1575			emit_mov(pc, dst[c], src[0][c]);
1576		}
1577		break;
1578	case TGSI_OPCODE_MUL:
1579		for (c = 0; c < 4; c++) {
1580			if (!(mask & (1 << c)))
1581				continue;
1582			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1583		}
1584		break;
1585	case TGSI_OPCODE_POW:
1586		emit_pow(pc, brdc, src[0][0], src[1][0]);
1587		break;
1588	case TGSI_OPCODE_RCP:
1589		emit_flop(pc, 0, brdc, src[0][0]);
1590		break;
1591	case TGSI_OPCODE_RSQ:
1592		emit_flop(pc, 2, brdc, src[0][0]);
1593		break;
1594	case TGSI_OPCODE_SCS:
1595		temp = temp_temp(pc);
1596		if (mask & 3)
1597			emit_precossin(pc, temp, src[0][0]);
1598		if (mask & (1 << 0))
1599			emit_flop(pc, 5, dst[0], temp);
1600		if (mask & (1 << 1))
1601			emit_flop(pc, 4, dst[1], temp);
1602		if (mask & (1 << 2))
1603			emit_mov_immdval(pc, dst[2], 0.0);
1604		if (mask & (1 << 3))
1605			emit_mov_immdval(pc, dst[3], 1.0);
1606		break;
1607	case TGSI_OPCODE_SGE:
1608		for (c = 0; c < 4; c++) {
1609			if (!(mask & (1 << c)))
1610				continue;
1611			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1612		}
1613		break;
1614	case TGSI_OPCODE_SIN:
1615		temp = temp_temp(pc);
1616		emit_precossin(pc, temp, src[0][0]);
1617		emit_flop(pc, 4, temp, temp);
1618		for (c = 0; c < 4; c++) {
1619			if (!(mask & (1 << c)))
1620				continue;
1621			emit_mov(pc, dst[c], temp);
1622		}
1623		break;
1624	case TGSI_OPCODE_SLT:
1625		for (c = 0; c < 4; c++) {
1626			if (!(mask & (1 << c)))
1627				continue;
1628			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1629		}
1630		break;
1631	case TGSI_OPCODE_SUB:
1632		for (c = 0; c < 4; c++) {
1633			if (!(mask & (1 << c)))
1634				continue;
1635			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1636		}
1637		break;
1638	case TGSI_OPCODE_TEX:
1639		emit_tex(pc, dst, mask, src[0], unit,
1640			 inst->InstructionExtTexture.Texture, FALSE);
1641		break;
1642	case TGSI_OPCODE_TXP:
1643		emit_tex(pc, dst, mask, src[0], unit,
1644			 inst->InstructionExtTexture.Texture, TRUE);
1645		break;
1646	case TGSI_OPCODE_XPD:
1647		temp = temp_temp(pc);
1648		if (mask & (1 << 0)) {
1649			emit_mul(pc, temp, src[0][2], src[1][1]);
1650			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1651		}
1652		if (mask & (1 << 1)) {
1653			emit_mul(pc, temp, src[0][0], src[1][2]);
1654			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1655		}
1656		if (mask & (1 << 2)) {
1657			emit_mul(pc, temp, src[0][1], src[1][0]);
1658			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1659		}
1660		if (mask & (1 << 3))
1661			emit_mov_immdval(pc, dst[3], 1.0);
1662		break;
1663	case TGSI_OPCODE_END:
1664		break;
1665	default:
1666		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1667		return FALSE;
1668	}
1669
1670	if (brdc) {
1671		if (sat)
1672			emit_sat(pc, brdc, brdc);
1673		for (c = 0; c < 4; c++)
1674			if ((mask & (1 << c)) && dst[c] != brdc)
1675				emit_mov(pc, dst[c], brdc);
1676	} else
1677	if (sat) {
1678		for (c = 0; c < 4; c++) {
1679			if (!(mask & (1 << c)))
1680				continue;
1681			/* in this case we saturate later */
1682			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
1683				continue;
1684			emit_sat(pc, rdst[c], dst[c]);
1685		}
1686	}
1687
1688	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1689		for (c = 0; c < 4; c++) {
1690			if (!src[i][c])
1691				continue;
1692			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1693				FREE(src[i][c]);
1694		}
1695	}
1696
1697	kill_temp_temp(pc);
1698	return TRUE;
1699}
1700
1701static void
1702prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
1703		  unsigned *r_usage[2])
1704{
1705	const struct tgsi_full_instruction *insn;
1706	const struct tgsi_full_src_register *src;
1707	const struct tgsi_dst_register *dst;
1708
1709	unsigned i, c, k, n, mask, *acc_p;
1710
1711	insn = &tok->FullInstruction;
1712	dst = &insn->FullDstRegisters[0].DstRegister;
1713	mask = dst->WriteMask;
1714
1715	if (!r_usage[0])
1716		r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
1717	if (!r_usage[1])
1718		r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
1719
1720	if (dst->File == TGSI_FILE_TEMPORARY) {
1721		for (c = 0; c < 4; c++) {
1722			if (!(mask & (1 << c)))
1723				continue;
1724			r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
1725		}
1726	}
1727
1728	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1729		src = &insn->FullSrcRegisters[i];
1730
1731		switch (src->SrcRegister.File) {
1732		case TGSI_FILE_TEMPORARY:
1733			acc_p = r_usage[0];
1734			break;
1735		case TGSI_FILE_INPUT:
1736			acc_p = r_usage[1];
1737			break;
1738		default:
1739			continue;
1740		}
1741
1742		mask = nv50_tgsi_src_mask(insn, i);
1743
1744		for (c = 0; c < 4; c++) {
1745			if (!(mask & (1 << c)))
1746				continue;
1747
1748			k = tgsi_util_get_full_src_register_extswizzle(src, c);
1749			switch (k) {
1750			case TGSI_EXTSWIZZLE_X:
1751			case TGSI_EXTSWIZZLE_Y:
1752			case TGSI_EXTSWIZZLE_Z:
1753			case TGSI_EXTSWIZZLE_W:
1754				n = src->SrcRegister.Index * 4 + k;
1755				acc_p[n] = pc->insn_nr;
1756				break;
1757			default:
1758				break;
1759			}
1760		}
1761	}
1762}
1763
/* Choose an order in which to write the dst components so that sources are
 * not 'corrupted' before they are read, and report what could not be fixed
 * by reordering alone.
 *
 * m[i]    (out) component to write in the i-th position
 * rdep[c] (in)  bitmask of dst[i] that require dst[c] as source
 *
 * Returns a bitmask of POSITIONS in m[] (not components): bit i is set if
 * the component written at position i is still needed as a source by a
 * component written later, i.e. it has to go to a temporary first.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* unsafe is only ever OR-ed into, so it must start out as 0 */
	unsigned i, c, x, unsafe = 0;

	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart (the loop's c++ makes the scan resume at index 1) */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
1808
1809/* Select a suitable dst register for broadcasting scalar results,
1810 * or return NULL if we have to allocate an extra TEMP.
1811 *
1812 * If e.g. only 1 component is written, we may also emit the final
1813 * result to a write-only register.
1814 */
1815static struct nv50_reg *
1816tgsi_broadcast_dst(struct nv50_pc *pc,
1817		   const struct tgsi_full_dst_register *fd, unsigned mask)
1818{
1819	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
1820		int c = ffs(~mask & fd->DstRegister.WriteMask);
1821		if (c)
1822			return tgsi_dst(pc, c - 1, fd);
1823	} else {
1824		int c = ffs(fd->DstRegister.WriteMask) - 1;
1825		if ((1 << c) == fd->DstRegister.WriteMask)
1826			return tgsi_dst(pc, c, fd);
1827	}
1828
1829	return NULL;
1830}
1831
1832/* Scan source swizzles and return a bitmask indicating dst regs that
1833 * also occur among the src regs, and fill rdep for nv50_revdep_reoder.
1834 */
1835static unsigned
1836nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
1837		       unsigned rdep[4])
1838{
1839	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
1840	const struct tgsi_full_src_register *fs;
1841	unsigned i, deqs = 0;
1842
1843	for (i = 0; i < 4; ++i)
1844		rdep[i] = 0;
1845
1846	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1847		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
1848		boolean neg_supp = negate_supported(insn, i);
1849
1850		fs = &insn->FullSrcRegisters[i];
1851		if (fs->SrcRegister.File != fd->DstRegister.File ||
1852		    fs->SrcRegister.Index != fd->DstRegister.Index)
1853			continue;
1854
1855		for (chn = 0; chn < 4; ++chn) {
1856			unsigned s, c;
1857
1858			if (!(mask & (1 << chn))) /* src is not read */
1859				continue;
1860			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
1861			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
1862
1863			if (c > TGSI_EXTSWIZZLE_W ||
1864			    !(fd->DstRegister.WriteMask & (1 << c)))
1865				continue;
1866
1867			/* no danger if src is copied to TEMP first */
1868			if ((s != TGSI_UTIL_SIGN_KEEP) &&
1869			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
1870				continue;
1871
1872			rdep[c] |= nv50_tgsi_dst_revdep(
1873				insn->Instruction.Opcode, i, chn);
1874			deqs |= (1 << c);
1875		}
1876	}
1877
1878	return deqs;
1879}
1880
1881static boolean
1882nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
1883{
1884	struct tgsi_full_instruction insn = tok->FullInstruction;
1885	const struct tgsi_full_dst_register *fd;
1886	unsigned i, deqs, rdep[4], m[4];
1887
1888	fd = &tok->FullInstruction.FullDstRegisters[0];
1889	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
1890
1891	if (is_scalar_op(insn.Instruction.Opcode)) {
1892		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
1893		if (!pc->r_brdc)
1894			pc->r_brdc = temp_temp(pc);
1895		return nv50_program_tx_insn(pc, &insn);
1896	}
1897	pc->r_brdc = NULL;
1898
1899	if (!deqs)
1900		return nv50_program_tx_insn(pc, &insn);
1901
1902	deqs = nv50_revdep_reorder(m, rdep);
1903
1904	for (i = 0; i < 4; ++i) {
1905		assert(pc->r_dst[m[i]] == NULL);
1906
1907		insn.FullDstRegisters[0].DstRegister.WriteMask =
1908			fd->DstRegister.WriteMask & (1 << m[i]);
1909
1910		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
1911			continue;
1912
1913		if (deqs & (1 << i))
1914			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
1915
1916		if (!nv50_program_tx_insn(pc, &insn))
1917			return FALSE;
1918	}
1919
1920	for (i = 0; i < 4; i++) {
1921		struct nv50_reg *reg = pc->r_dst[i];
1922		if (!reg)
1923			continue;
1924		pc->r_dst[i] = NULL;
1925
1926		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
1927			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
1928		else
1929			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
1930		free_temp(pc, reg);
1931	}
1932
1933	return TRUE;
1934}
1935
1936static unsigned
1937load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
1938	       int *aid, int *p_oid)
1939{
1940	struct nv50_reg *iv;
1941	int oid, c, n;
1942	unsigned mask = 0;
1943
1944	iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
1945
1946	for (c = 0, n = i * 4; c < 4; c++, n++) {
1947		oid = (*p_oid)++;
1948		pc->attr[n].type = P_TEMP;
1949		pc->attr[n].index = i;
1950
1951		if (pc->attr[n].acc == acc[n])
1952			continue;
1953		mask |= (1 << c);
1954
1955		pc->attr[n].acc = acc[n];
1956		pc->attr[n].rhw = pc->attr[n].hw = -1;
1957		alloc_reg(pc, &pc->attr[n]);
1958
1959		pc->attr[n].rhw = (*aid)++;
1960		emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
1961
1962		pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
1963		(*mid)++;
1964		pc->p->cfg.fp.regs[1] += 0x00010001;
1965	}
1966
1967	return mask;
1968}
1969
1970static boolean
1971nv50_program_tx_prep(struct nv50_pc *pc)
1972{
1973	struct tgsi_parse_context p;
1974	boolean ret = FALSE;
1975	unsigned i, c;
1976	unsigned fcol, bcol, fcrd, depr;
1977
1978	/* count (centroid) perspective interpolations */
1979	unsigned centroid_loads = 0;
1980	unsigned perspect_loads = 0;
1981
1982	/* track register access for temps and attrs */
1983	unsigned *r_usage[2];
1984	r_usage[0] = NULL;
1985	r_usage[1] = NULL;
1986
1987	depr = fcol = bcol = fcrd = 0xffff;
1988
1989	if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1990		pc->p->cfg.fp.regs[0] = 0x01000404;
1991		pc->p->cfg.fp.regs[1] = 0x00000400;
1992	}
1993
1994	tgsi_parse_init(&p, pc->p->pipe.tokens);
1995	while (!tgsi_parse_end_of_tokens(&p)) {
1996		const union tgsi_full_token *tok = &p.FullToken;
1997
1998		tgsi_parse_token(&p);
1999		switch (tok->Token.Type) {
2000		case TGSI_TOKEN_TYPE_IMMEDIATE:
2001		{
2002			const struct tgsi_full_immediate *imm =
2003				&p.FullToken.FullImmediate;
2004
2005			ctor_immd(pc, imm->u[0].Float,
2006				      imm->u[1].Float,
2007				      imm->u[2].Float,
2008				      imm->u[3].Float);
2009		}
2010			break;
2011		case TGSI_TOKEN_TYPE_DECLARATION:
2012		{
2013			const struct tgsi_full_declaration *d;
2014			unsigned last, first, mode;
2015
2016			d = &p.FullToken.FullDeclaration;
2017			first = d->DeclarationRange.First;
2018			last = d->DeclarationRange.Last;
2019
2020			switch (d->Declaration.File) {
2021			case TGSI_FILE_TEMPORARY:
2022				if (pc->temp_nr < (last + 1))
2023					pc->temp_nr = last + 1;
2024				break;
2025			case TGSI_FILE_OUTPUT:
2026				if (pc->result_nr < (last + 1))
2027					pc->result_nr = last + 1;
2028
2029				if (!d->Declaration.Semantic)
2030					break;
2031
2032				switch (d->Semantic.SemanticName) {
2033				case TGSI_SEMANTIC_POSITION:
2034					depr = first;
2035					pc->p->cfg.fp.regs[2] |= 0x00000100;
2036					pc->p->cfg.fp.regs[3] |= 0x00000011;
2037					break;
2038				default:
2039					break;
2040				}
2041
2042				break;
2043			case TGSI_FILE_INPUT:
2044			{
2045				if (pc->attr_nr < (last + 1))
2046					pc->attr_nr = last + 1;
2047
2048				if (pc->p->type != PIPE_SHADER_FRAGMENT)
2049					break;
2050
2051				switch (d->Declaration.Interpolate) {
2052				case TGSI_INTERPOLATE_CONSTANT:
2053					mode = INTERP_FLAT;
2054					break;
2055				case TGSI_INTERPOLATE_PERSPECTIVE:
2056					mode = INTERP_PERSPECTIVE;
2057					break;
2058				default:
2059					mode = INTERP_LINEAR;
2060					break;
2061				}
2062
2063				if (d->Declaration.Semantic) {
2064					switch (d->Semantic.SemanticName) {
2065					case TGSI_SEMANTIC_POSITION:
2066						fcrd = first;
2067						break;
2068					case TGSI_SEMANTIC_COLOR:
2069						fcol = first;
2070						mode = INTERP_PERSPECTIVE;
2071						break;
2072					case TGSI_SEMANTIC_BCOLOR:
2073						bcol = first;
2074						mode = INTERP_PERSPECTIVE;
2075						break;
2076					}
2077				}
2078
2079				if (d->Declaration.Centroid) {
2080					mode |= INTERP_CENTROID;
2081					if (mode & INTERP_PERSPECTIVE)
2082						centroid_loads++;
2083				} else
2084				if (mode & INTERP_PERSPECTIVE)
2085					perspect_loads++;
2086
2087				assert(last < 32);
2088				for (i = first; i <= last; i++)
2089					pc->interp_mode[i] = mode;
2090			}
2091				break;
2092			case TGSI_FILE_CONSTANT:
2093				if (pc->param_nr < (last + 1))
2094					pc->param_nr = last + 1;
2095				break;
2096			case TGSI_FILE_SAMPLER:
2097				break;
2098			default:
2099				NOUVEAU_ERR("bad decl file %d\n",
2100					    d->Declaration.File);
2101				goto out_err;
2102			}
2103		}
2104			break;
2105		case TGSI_TOKEN_TYPE_INSTRUCTION:
2106			pc->insn_nr++;
2107			prep_inspect_insn(pc, tok, r_usage);
2108			break;
2109		default:
2110			break;
2111		}
2112	}
2113
2114	if (pc->temp_nr) {
2115		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
2116		if (!pc->temp)
2117			goto out_err;
2118
2119		for (i = 0; i < pc->temp_nr; i++) {
2120			for (c = 0; c < 4; c++) {
2121				pc->temp[i*4+c].type = P_TEMP;
2122				pc->temp[i*4+c].hw = -1;
2123				pc->temp[i*4+c].rhw = -1;
2124				pc->temp[i*4+c].index = i;
2125				pc->temp[i*4+c].acc = r_usage[0][i*4+c];
2126			}
2127		}
2128	}
2129
2130	if (pc->attr_nr) {
2131		int oid = 4, mid = 4, aid = 0;
2132		/* oid = VP output id
2133		 * aid = FP attribute/interpolant id
2134		 * mid = VP output mapping field ID
2135		 */
2136
2137		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
2138		if (!pc->attr)
2139			goto out_err;
2140
2141		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
2142			/* position should be loaded first */
2143			if (fcrd != 0xffff) {
2144				unsigned mask;
2145				mid = 0;
2146				mask = load_fp_attrib(pc, fcrd, r_usage[1],
2147						      &mid, &aid, &oid);
2148				oid = 0;
2149				pc->p->cfg.fp.regs[1] |= (mask << 24);
2150				pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
2151			}
2152			pc->p->cfg.fp.map[0] += 0x03020100;
2153
2154			/* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
2155
2156			if (perspect_loads) {
2157				pc->iv_p = alloc_temp(pc, NULL);
2158
2159				if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
2160					pc->p->cfg.fp.regs[1] |= 0x08000000;
2161					pc->iv_p->rhw = aid++;
2162					emit_interp(pc, pc->iv_p, NULL,
2163						    INTERP_LINEAR);
2164					emit_flop(pc, 0, pc->iv_p, pc->iv_p);
2165				} else {
2166					pc->iv_p->rhw = aid - 1;
2167					emit_flop(pc, 0, pc->iv_p,
2168						  &pc->attr[fcrd * 4 + 3]);
2169				}
2170			}
2171
2172			if (centroid_loads) {
2173				pc->iv_c = alloc_temp(pc, NULL);
2174				pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
2175				emit_interp(pc, pc->iv_c, NULL,
2176					    INTERP_CENTROID);
2177				emit_flop(pc, 0, pc->iv_c, pc->iv_c);
2178				pc->p->cfg.fp.regs[1] |= 0x08000000;
2179			}
2180
2181			for (c = 0; c < 4; c++) {
2182				/* I don't know what these values do, but
2183				 * let's set them like the blob does:
2184				 */
2185				if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
2186					pc->p->cfg.fp.regs[0] += 0x00010000;
2187				if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
2188					pc->p->cfg.fp.regs[0] += 0x00010000;
2189			}
2190
2191			for (i = 0; i < pc->attr_nr; i++)
2192				load_fp_attrib(pc, i, r_usage[1],
2193					       &mid, &aid, &oid);
2194
2195			if (pc->iv_p)
2196				free_temp(pc, pc->iv_p);
2197			if (pc->iv_c)
2198				free_temp(pc, pc->iv_c);
2199
2200			pc->p->cfg.fp.high_map = (mid / 4);
2201			pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
2202		} else {
2203			/* vertex program */
2204			for (i = 0; i < pc->attr_nr * 4; i++) {
2205				pc->p->cfg.vp.attr[aid / 32] |=
2206					(1 << (aid % 32));
2207				pc->attr[i].type = P_ATTR;
2208				pc->attr[i].hw = aid++;
2209				pc->attr[i].index = i / 4;
2210			}
2211		}
2212	}
2213
2214	if (pc->result_nr) {
2215		int rid = 0;
2216
2217		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
2218		if (!pc->result)
2219			goto out_err;
2220
2221		for (i = 0; i < pc->result_nr; i++) {
2222			for (c = 0; c < 4; c++) {
2223				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
2224					pc->result[i*4+c].type = P_TEMP;
2225					pc->result[i*4+c].hw = -1;
2226					pc->result[i*4+c].rhw = (i == depr) ?
2227						-1 : rid++;
2228				} else {
2229					pc->result[i*4+c].type = P_RESULT;
2230					pc->result[i*4+c].hw = rid++;
2231				}
2232				pc->result[i*4+c].index = i;
2233			}
2234
2235			if (pc->p->type == PIPE_SHADER_FRAGMENT &&
2236			    depr != 0xffff) {
2237				pc->result[depr * 4 + 2].rhw =
2238					(pc->result_nr - 1) * 4;
2239			}
2240		}
2241	}
2242
2243	if (pc->param_nr) {
2244		int rid = 0;
2245
2246		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
2247		if (!pc->param)
2248			goto out_err;
2249
2250		for (i = 0; i < pc->param_nr; i++) {
2251			for (c = 0; c < 4; c++) {
2252				pc->param[i*4+c].type = P_CONST;
2253				pc->param[i*4+c].hw = rid++;
2254				pc->param[i*4+c].index = i;
2255			}
2256		}
2257	}
2258
2259	if (pc->immd_nr) {
2260		int rid = 0;
2261
2262		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
2263		if (!pc->immd)
2264			goto out_err;
2265
2266		for (i = 0; i < pc->immd_nr; i++) {
2267			for (c = 0; c < 4; c++) {
2268				pc->immd[i*4+c].type = P_IMMD;
2269				pc->immd[i*4+c].hw = rid++;
2270				pc->immd[i*4+c].index = i;
2271			}
2272		}
2273	}
2274
2275	ret = TRUE;
2276out_err:
2277	if (r_usage[0])
2278		FREE(r_usage[0]);
2279	if (r_usage[1])
2280		FREE(r_usage[1]);
2281
2282	tgsi_parse_free(&p);
2283	return ret;
2284}
2285
2286static void
2287free_nv50_pc(struct nv50_pc *pc)
2288{
2289	if (pc->immd)
2290		FREE(pc->immd);
2291	if (pc->param)
2292		FREE(pc->param);
2293	if (pc->result)
2294		FREE(pc->result);
2295	if (pc->attr)
2296		FREE(pc->attr);
2297	if (pc->temp)
2298		FREE(pc->temp);
2299
2300	FREE(pc);
2301}
2302
2303static boolean
2304nv50_program_tx(struct nv50_program *p)
2305{
2306	struct tgsi_parse_context parse;
2307	struct nv50_pc *pc;
2308	unsigned k;
2309	boolean ret;
2310
2311	pc = CALLOC_STRUCT(nv50_pc);
2312	if (!pc)
2313		return FALSE;
2314	pc->p = p;
2315	pc->p->cfg.high_temp = 4;
2316
2317	ret = nv50_program_tx_prep(pc);
2318	if (ret == FALSE)
2319		goto out_cleanup;
2320
2321	tgsi_parse_init(&parse, pc->p->pipe.tokens);
2322	while (!tgsi_parse_end_of_tokens(&parse)) {
2323		const union tgsi_full_token *tok = &parse.FullToken;
2324
2325		/* don't allow half insn/immd on first and last instruction */
2326		pc->allow32 = TRUE;
2327		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2328			pc->allow32 = FALSE;
2329
2330		tgsi_parse_token(&parse);
2331
2332		switch (tok->Token.Type) {
2333		case TGSI_TOKEN_TYPE_INSTRUCTION:
2334			++pc->insn_cur;
2335			ret = nv50_tgsi_insn(pc, tok);
2336			if (ret == FALSE)
2337				goto out_err;
2338			break;
2339		default:
2340			break;
2341		}
2342	}
2343
2344	if (p->type == PIPE_SHADER_FRAGMENT) {
2345		struct nv50_reg out;
2346
2347		out.type = P_TEMP;
2348		for (k = 0; k < pc->result_nr * 4; k++) {
2349			if (pc->result[k].rhw == -1)
2350				continue;
2351			if (pc->result[k].hw != pc->result[k].rhw) {
2352				out.hw = pc->result[k].rhw;
2353				emit_mov(pc, &out, &pc->result[k]);
2354			}
2355			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
2356				pc->p->cfg.high_result = pc->result[k].rhw + 1;
2357		}
2358	}
2359
2360	/* look for single half instructions and make them long */
2361	struct nv50_program_exec *e, *e_prev;
2362
2363	for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
2364		if (!is_long(e))
2365			k++;
2366
2367		if (!e->next || is_long(e->next)) {
2368			if (k & 1)
2369				convert_to_long(pc, e);
2370			k = 0;
2371		}
2372
2373		if (e->next)
2374			e_prev = e;
2375	}
2376
2377	if (!is_long(pc->p->exec_tail)) {
2378		/* this may occur if moving FP results */
2379		assert(e_prev && !is_long(e_prev));
2380		convert_to_long(pc, e_prev);
2381		convert_to_long(pc, pc->p->exec_tail);
2382	}
2383
2384	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
2385	pc->p->exec_tail->inst[1] |= 0x00000001;
2386
2387	p->param_nr = pc->param_nr * 4;
2388	p->immd_nr = pc->immd_nr * 4;
2389	p->immd = pc->immd_buf;
2390
2391out_err:
2392	tgsi_parse_free(&parse);
2393
2394out_cleanup:
2395	free_nv50_pc(pc);
2396	return ret;
2397}
2398
2399static void
2400nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2401{
2402	if (nv50_program_tx(p) == FALSE)
2403		assert(0);
2404	p->translated = TRUE;
2405}
2406
2407static void
2408nv50_program_upload_data(struct nv50_context *nv50, float *map,
2409			unsigned start, unsigned count, unsigned cbuf)
2410{
2411	struct nouveau_channel *chan = nv50->screen->base.channel;
2412	struct nouveau_grobj *tesla = nv50->screen->tesla;
2413
2414	while (count) {
2415		unsigned nr = count > 2047 ? 2047 : count;
2416
2417		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2418		OUT_RING  (chan, (cbuf << 0) | (start << 8));
2419		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2420		OUT_RINGp (chan, map, nr);
2421
2422		map += nr;
2423		start += nr;
2424		count -= nr;
2425	}
2426}
2427
2428static void
2429nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2430{
2431	struct pipe_screen *pscreen = nv50->pipe.screen;
2432
2433	if (!p->data[0] && p->immd_nr) {
2434		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2435
2436		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
2437			while (heap->next && heap->size < p->immd_nr) {
2438				struct nv50_program *evict = heap->next->priv;
2439				nouveau_resource_free(&evict->data[0]);
2440			}
2441
2442			if (nouveau_resource_alloc(heap, p->immd_nr, p,
2443						   &p->data[0]))
2444				assert(0);
2445		}
2446
2447		/* immediates only need to be uploaded again when freed */
2448		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2449					 p->immd_nr, NV50_CB_PMISC);
2450	}
2451
2452	if (!p->data[1] && p->param_nr) {
2453		struct nouveau_resource *heap =
2454			nv50->screen->parm_heap[p->type];
2455
2456		if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
2457			while (heap->next && heap->size < p->param_nr) {
2458				struct nv50_program *evict = heap->next->priv;
2459				nouveau_resource_free(&evict->data[1]);
2460			}
2461
2462			if (nouveau_resource_alloc(heap, p->param_nr, p,
2463						   &p->data[1]))
2464				assert(0);
2465		}
2466	}
2467
2468	if (p->param_nr) {
2469		unsigned cbuf = NV50_CB_PVP;
2470		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
2471					     PIPE_BUFFER_USAGE_CPU_READ);
2472		if (p->type == PIPE_SHADER_FRAGMENT)
2473			cbuf = NV50_CB_PFP;
2474		nv50_program_upload_data(nv50, map, p->data[1]->start,
2475					 p->param_nr, cbuf);
2476		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
2477	}
2478}
2479
2480static void
2481nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2482{
2483	struct nouveau_channel *chan = nv50->screen->base.channel;
2484	struct nouveau_grobj *tesla = nv50->screen->tesla;
2485	struct nv50_program_exec *e;
2486	struct nouveau_stateobj *so;
2487	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2488	unsigned start, count, *up, *ptr;
2489	boolean upload = FALSE;
2490
2491	if (!p->bo) {
2492		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
2493			       p->exec_size * 4, &p->bo);
2494		upload = TRUE;
2495	}
2496
2497	if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
2498		(p->data[1] && p->data[1]->start != p->data_start[1])) {
2499		for (e = p->exec_head; e; e = e->next) {
2500			unsigned ei, ci, bs;
2501
2502			if (e->param.index < 0)
2503				continue;
2504			bs = (e->inst[1] >> 22) & 0x07;
2505			assert(bs < 2);
2506			ei = e->param.shift >> 5;
2507			ci = e->param.index + p->data[bs]->start;
2508
2509			e->inst[ei] &= ~e->param.mask;
2510			e->inst[ei] |= (ci << e->param.shift);
2511		}
2512
2513		if (p->data[0])
2514			p->data_start[0] = p->data[0]->start;
2515		if (p->data[1])
2516			p->data_start[1] = p->data[1]->start;
2517
2518		upload = TRUE;
2519	}
2520
2521	if (!upload)
2522		return;
2523
2524#ifdef NV50_PROGRAM_DUMP
2525	NOUVEAU_ERR("-------\n");
2526	for (e = p->exec_head; e; e = e->next) {
2527		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2528		if (is_long(e))
2529			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2530	}
2531#endif
2532
2533	up = ptr = MALLOC(p->exec_size * 4);
2534	for (e = p->exec_head; e; e = e->next) {
2535		*(ptr++) = e->inst[0];
2536		if (is_long(e))
2537			*(ptr++) = e->inst[1];
2538	}
2539
2540	so = so_new(4,2);
2541	so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
2542	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2543	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2544	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2545
2546	start = 0; count = p->exec_size;
2547	while (count) {
2548		struct nouveau_channel *chan = nv50->screen->base.channel;
2549		unsigned nr;
2550
2551		so_emit(chan, so);
2552
2553		nr = MIN2(count, 2047);
2554		nr = MIN2(chan->pushbuf->remaining, nr);
2555		if (chan->pushbuf->remaining < (nr + 3)) {
2556			FIRE_RING(chan);
2557			continue;
2558		}
2559
2560		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2561		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
2562		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2563		OUT_RINGp (chan, up + start, nr);
2564
2565		start += nr;
2566		count -= nr;
2567	}
2568
2569	FREE(up);
2570	so_ref(NULL, &so);
2571}
2572
2573void
2574nv50_vertprog_validate(struct nv50_context *nv50)
2575{
2576	struct nouveau_grobj *tesla = nv50->screen->tesla;
2577	struct nv50_program *p = nv50->vertprog;
2578	struct nouveau_stateobj *so;
2579
2580	if (!p->translated) {
2581		nv50_program_validate(nv50, p);
2582		if (!p->translated)
2583			assert(0);
2584	}
2585
2586	nv50_program_validate_data(nv50, p);
2587	nv50_program_validate_code(nv50, p);
2588
2589	so = so_new(13, 2);
2590	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2591	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2592		      NOUVEAU_BO_HIGH, 0, 0);
2593	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2594		      NOUVEAU_BO_LOW, 0, 0);
2595	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
2596	so_data  (so, p->cfg.vp.attr[0]);
2597	so_data  (so, p->cfg.vp.attr[1]);
2598	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
2599	so_data  (so, p->cfg.high_result);
2600	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
2601	so_data  (so, p->cfg.high_result); //8);
2602	so_data  (so, p->cfg.high_temp);
2603	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
2604	so_data  (so, 0); /* program start offset */
2605	so_ref(so, &nv50->state.vertprog);
2606	so_ref(NULL, &so);
2607}
2608
2609void
2610nv50_fragprog_validate(struct nv50_context *nv50)
2611{
2612	struct nouveau_grobj *tesla = nv50->screen->tesla;
2613	struct nv50_program *p = nv50->fragprog;
2614	struct nouveau_stateobj *so;
2615	unsigned i;
2616
2617	if (!p->translated) {
2618		nv50_program_validate(nv50, p);
2619		if (!p->translated)
2620			assert(0);
2621	}
2622
2623	nv50_program_validate_data(nv50, p);
2624	nv50_program_validate_code(nv50, p);
2625
2626	so = so_new(64, 2);
2627	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2628	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2629		      NOUVEAU_BO_HIGH, 0, 0);
2630	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2631		      NOUVEAU_BO_LOW, 0, 0);
2632	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
2633	so_data  (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
2634	so_data  (so, 0x00000004);
2635	so_data  (so, 0x00000000);
2636	so_data  (so, 0x00000000);
2637	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map);
2638	for (i = 0; i < p->cfg.fp.high_map; i++)
2639		so_data(so, p->cfg.fp.map[i]);
2640	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2);
2641	so_data  (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
2642	so_data  (so, p->cfg.high_temp);
2643	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
2644	so_data  (so, p->cfg.high_result);
2645	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
2646	so_data  (so, p->cfg.fp.regs[2]);
2647	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
2648	so_data  (so, p->cfg.fp.regs[3]);
2649	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
2650	so_data  (so, 0); /* program start offset */
2651	so_ref(so, &nv50->state.fragprog);
2652	so_ref(NULL, &so);
2653}
2654
2655void
2656nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2657{
2658	while (p->exec_head) {
2659		struct nv50_program_exec *e = p->exec_head;
2660
2661		p->exec_head = e->next;
2662		FREE(e);
2663	}
2664	p->exec_tail = NULL;
2665	p->exec_size = 0;
2666
2667	nouveau_bo_ref(NULL, &p->bo);
2668
2669	nouveau_resource_free(&p->data[0]);
2670	nouveau_resource_free(&p->data[1]);
2671
2672	p->translated = 0;
2673}
2674