nv50_program.c revision f204eb184237b387432413212a3a20d83c87594b
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
34#define NV50_SU_MAX_TEMP 64
35//#define NV50_PROGRAM_DUMP
36
37/* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * 	- Fuck it off, introduce a way to negate args for ops that
41 * 	  support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if a set_src_*() call causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
53 * FUCK! watch dst==src vectors, can overwrite components that are needed.
54 * 	ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * 	FP attr/result assignment - how?
58 * 		attrib
59 * 			- 0x16bc maps vp output onto fp hpos
60 * 			- 0x16c0 maps vp output onto fp col0
61 * 		result
62 * 			- colr always 0-3
63 * 			- depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * 	      "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * 	- 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * 	- XX == FP high something
75 */
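
/* nv50_reg describes a single scalar value in one of the register files
 * listed below.  Conventions used throughout this file, as far as the
 * allocator code lets one tell: hw == -1 means no hardware slot has been
 * assigned yet (alloc_reg() picks one on first use), and index == -1 marks
 * a compiler-internal scratch register, which gets FREE'd again as soon as
 * its hardware slot is released (see release_hw()/free_temp()).
 */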
76struct nv50_reg {
77	enum {
78		P_TEMP,
79		P_ATTR,
80		P_RESULT,
81		P_CONST,
82		P_IMMD
83	} type;
84	int index;
85
86	int hw;
87	int neg;
88
89	int rhw; /* result hw for FP outputs, or interpolant index */
90	int acc; /* instruction where this reg is last read (first insn == 1) */
91};
92
93/* arbitrary limits */
94#define MAX_IF_DEPTH 4
95#define MAX_LOOP_DEPTH 4
96
97struct nv50_pc {
98	struct nv50_program *p;
99
100	/* hw resources */
101	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
102
103	/* tgsi resources */
104	struct nv50_reg *temp;
105	int temp_nr;
106	struct nv50_reg *attr;
107	int attr_nr;
108	struct nv50_reg *result;
109	int result_nr;
110	struct nv50_reg *param;
111	int param_nr;
112	struct nv50_reg *immd;
113	float *immd_buf;
114	int immd_nr;
115
116	struct nv50_reg *temp_temp[16];
117	unsigned temp_temp_nr;
118
119	/* broadcast and destination replacement regs */
120	struct nv50_reg *r_brdc;
121	struct nv50_reg *r_dst[4];
122
123	unsigned interp_mode[32];
124	/* perspective interpolation registers */
125	struct nv50_reg *iv_p;
126	struct nv50_reg *iv_c;
127
128	struct nv50_program_exec *if_cond;
129	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
130	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
131	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
132	int if_lvl, loop_lvl;
133	unsigned loop_pos[MAX_LOOP_DEPTH];
134
135	/* current instruction and total number of insns */
136	unsigned insn_cur;
137	unsigned insn_nr;
138
139	boolean allow32;
140};
141
142static INLINE void
143ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
144{
145	reg->type = type;
146	reg->index = index;
147	reg->hw = hw;
148	reg->neg = 0;
149	reg->rhw = -1;
150	reg->acc = 0;
151}
152
153static INLINE unsigned
154popcnt4(uint32_t val)
155{
156	static const unsigned cnt[16]
157	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
158	return cnt[val & 0xf];
159}
160
161static void
162alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
163{
164	int i = 0;
165
166	if (reg->type == P_RESULT) {
167		if (pc->p->cfg.high_result < (reg->hw + 1))
168			pc->p->cfg.high_result = reg->hw + 1;
169	}
170
171	if (reg->type != P_TEMP)
172		return;
173
174	if (reg->hw >= 0) {
175		/*XXX: do this here too to catch FP temp-as-attr usage..
176		 *     not clean, but works */
177		if (pc->p->cfg.high_temp < (reg->hw + 1))
178			pc->p->cfg.high_temp = reg->hw + 1;
179		return;
180	}
181
182	if (reg->rhw != -1) {
183		/* try to allocate temporary with index rhw first */
184		if (!(pc->r_temp[reg->rhw])) {
185			pc->r_temp[reg->rhw] = reg;
186			reg->hw = reg->rhw;
187			if (pc->p->cfg.high_temp < (reg->rhw + 1))
188				pc->p->cfg.high_temp = reg->rhw + 1;
189			return;
190		}
191		/* make sure we don't get things like $r0 needs to go
192		 * in $r1 and $r1 in $r0
193		 */
194		i = pc->result_nr * 4;
195	}
196
197	for (; i < NV50_SU_MAX_TEMP; i++) {
198		if (!(pc->r_temp[i])) {
199			pc->r_temp[i] = reg;
200			reg->hw = i;
201			if (pc->p->cfg.high_temp < (i + 1))
202				pc->p->cfg.high_temp = i + 1;
203			return;
204		}
205	}
206
207	assert(0);
208}
209
210/* XXX: For shaders that aren't executed linearly (e.g. shaders that
211 * contain loops), we need to assign all hw regs to TGSI TEMPs early,
212 * lest we risk temp_temps overwriting regs alloc'd "later".
213 */
214static struct nv50_reg *
215alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
216{
217	struct nv50_reg *r;
218	int i;
219
220	if (dst && dst->type == P_TEMP && dst->hw == -1)
221		return dst;
222
223	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
224		if (!pc->r_temp[i]) {
225			r = MALLOC_STRUCT(nv50_reg);
226			ctor_reg(r, P_TEMP, -1, i);
227			pc->r_temp[i] = r;
228			return r;
229		}
230	}
231
232	assert(0);
233	return NULL;
234}
235
236/* Assign the hw of the discarded temporary register src
237 * to the tgsi register dst and free src.
238 */
239static void
240assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
241{
242	assert(src->index == -1 && src->hw != -1);
243
244	if (dst->hw != -1)
245		pc->r_temp[dst->hw] = NULL;
246	pc->r_temp[src->hw] = dst;
247	dst->hw = src->hw;
248
249	FREE(src);
250}
251
252/* release the hardware resource held by r */
253static void
254release_hw(struct nv50_pc *pc, struct nv50_reg *r)
255{
256	assert(r->type == P_TEMP);
257	if (r->hw == -1)
258		return;
259
260	assert(pc->r_temp[r->hw] == r);
261	pc->r_temp[r->hw] = NULL;
262
263	r->acc = 0;
264	if (r->index == -1)
265		FREE(r);
266}
267
268static void
269free_temp(struct nv50_pc *pc, struct nv50_reg *r)
270{
271	if (r->index == -1) {
272		unsigned hw = r->hw;
273
274		FREE(pc->r_temp[hw]);
275		pc->r_temp[hw] = NULL;
276	}
277}
278
279static int
280alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
281{
282	int i;
283
284	if ((idx + 4) > NV50_SU_MAX_TEMP)
285		return 1;
286
287	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
288	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
289		return alloc_temp4(pc, dst, idx + 4);
290
291	for (i = 0; i < 4; i++) {
292		dst[i] = MALLOC_STRUCT(nv50_reg);
293		ctor_reg(dst[i], P_TEMP, -1, idx + i);
294		pc->r_temp[idx + i] = dst[i];
295	}
296
297	return 0;
298}
299
300static void
301free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
302{
303	int i;
304
305	for (i = 0; i < 4; i++)
306		free_temp(pc, reg[i]);
307}
308
309static struct nv50_reg *
310temp_temp(struct nv50_pc *pc)
311{
312	if (pc->temp_temp_nr >= 16)
313		assert(0);
314
315	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
316	return pc->temp_temp[pc->temp_temp_nr++];
317}
318
319static void
320kill_temp_temp(struct nv50_pc *pc)
321{
322	int i;
323
324	for (i = 0; i < pc->temp_temp_nr; i++)
325		free_temp(pc, pc->temp_temp[i]);
326	pc->temp_temp_nr = 0;
327}
328
329static int
330ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
331{
332	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
333			       (pc->immd_nr + 1) * 4 * sizeof(float));
334	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
335	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
336	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
337	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
338
339	return pc->immd_nr++;
340}
341
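/* Note on the immediate pool (a sketch of what the code below does):
 * alloc_immd() first searches the existing buffer for an exact match of f
 * and only appends a new vec4 on a miss.  Since ctor_immd() is called with
 * (f, -f, 0.5 * f, 0), a later request for e.g. -1.0 or 0.5 after 1.0 has
 * been allocated is satisfied from the same slot without growing the pool.
 */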
342static struct nv50_reg *
343alloc_immd(struct nv50_pc *pc, float f)
344{
345	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
346	unsigned hw;
347
348	for (hw = 0; hw < pc->immd_nr * 4; hw++)
349		if (pc->immd_buf[hw] == f)
350			break;
351
352	if (hw == pc->immd_nr * 4)
353		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
354
355	ctor_reg(r, P_IMMD, -1, hw);
356	return r;
357}
358
359static struct nv50_program_exec *
360exec(struct nv50_pc *pc)
361{
362	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
363
364	e->param.index = -1;
365	return e;
366}
367
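/* Append e to the program.  exec_size is counted in 32-bit opcode words:
 * a "long" instruction (bit 0 of inst[0] set, see is_long()) occupies two
 * words, a short one a single word; convert_to_long() therefore also has
 * to bump the count when it widens an already emitted instruction.
 */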
368static void
369emit(struct nv50_pc *pc, struct nv50_program_exec *e)
370{
371	struct nv50_program *p = pc->p;
372
373	if (p->exec_tail)
374		p->exec_tail->next = e;
375	if (!p->exec_head)
376		p->exec_head = e;
377	p->exec_tail = e;
378	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
379}
380
381static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
382
383static boolean
384is_long(struct nv50_program_exec *e)
385{
386	if (e->inst[0] & 1)
387		return TRUE;
388	return FALSE;
389}
390
391static boolean
392is_immd(struct nv50_program_exec *e)
393{
394	if (is_long(e) && (e->inst[1] & 3) == 3)
395		return TRUE;
396	return FALSE;
397}
398
399static INLINE void
400set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
401	 struct nv50_program_exec *e)
402{
403	set_long(pc, e);
404	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
405	e->inst[1] |= (pred << 7) | (idx << 12);
406}
407
408static INLINE void
409set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
410	    struct nv50_program_exec *e)
411{
412	set_long(pc, e);
413	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
414	e->inst[1] |= (idx << 4) | (on << 6);
415}
416
417static INLINE void
418set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
419{
420	if (is_long(e))
421		return;
422
423	e->inst[0] |= 1;
424	set_pred(pc, 0xf, 0, e);
425	set_pred_wr(pc, 0, 0, e);
426}
427
428static INLINE void
429set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
430{
431	if (dst->type == P_RESULT) {
432		set_long(pc, e);
433		e->inst[1] |= 0x00000008;
434	}
435
436	alloc_reg(pc, dst);
437	e->inst[0] |= (dst->hw << 2);
438}
439
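/* Encode an inlined immediate (sketch of the packing done below): the
 * 32-bit float value is split so that its low 6 bits land in inst[0]
 * bits 16-21 (the src1 field) and the remaining 26 bits in inst[1]
 * starting at bit 2; inst[1] bits 0-1 are both set, which is what
 * is_immd() tests for.  This is also why such an instruction cannot
 * carry predication - the fields overlap (see the XXX below).
 */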
440static INLINE void
441set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
442{
443	float f = pc->immd_buf[imm->hw];
444	unsigned val = fui(imm->neg ? -f : f);
445
446	set_long(pc, e);
447	/*XXX: can't be predicated - bits overlap.. catch cases where both
448	 *     are required and avoid them. */
449	set_pred(pc, 0, 0, e);
450	set_pred_wr(pc, 0, 0, e);
451
452	e->inst[1] |= 0x00000002 | 0x00000001;
453	e->inst[0] |= (val & 0x3f) << 16;
454	e->inst[1] |= (val >> 6) << 2;
455}
456
457
458#define INTERP_LINEAR		0
459#define INTERP_FLAT			1
460#define INTERP_PERSPECTIVE	2
461#define INTERP_CENTROID		4
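
/* The mode bits above are combined into the value passed to emit_interp():
 * INTERP_FLAT selects flat shading and takes priority over the others,
 * INTERP_PERSPECTIVE additionally needs the iv register (apparently the
 * reciprocal of the interpolated W set up by load_interpolant()), and
 * INTERP_CENTROID may be OR'ed into the linear or perspective variants.
 */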
462
463/* interpolant index has been stored in dst->rhw */
464static void
465emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
466		unsigned mode)
467{
468	struct nv50_program_exec *e = exec(pc);
469	assert(dst->rhw != -1);
470
471	e->inst[0] |= 0x80000000;
472	set_dst(pc, dst, e);
473	e->inst[0] |= (dst->rhw << 16);
474
475	if (mode & INTERP_FLAT) {
476		e->inst[0] |= (1 << 8);
477	} else {
478		if (mode & INTERP_PERSPECTIVE) {
479			e->inst[0] |= (1 << 25);
480			alloc_reg(pc, iv);
481			e->inst[0] |= (iv->hw << 9);
482		}
483
484		if (mode & INTERP_CENTROID)
485			e->inst[0] |= (1 << 24);
486	}
487
488	emit(pc, e);
489}
490
491static void
492set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
493	 struct nv50_program_exec *e)
494{
495	set_long(pc, e);
496
497	e->param.index = src->hw;
498	e->param.shift = s;
499	e->param.mask = m << (s % 32);
500
501	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
502}
503
504static void
505emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
506{
507	struct nv50_program_exec *e = exec(pc);
508
509	e->inst[0] |= 0x10000000;
510
511	set_dst(pc, dst, e);
512
513	if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
514		set_immd(pc, src, e);
515		/*XXX: 32-bit, but steals part of "half" reg space - need to
516		 *     catch and handle this case if/when we do half-regs
517		 */
518	} else
519	if (src->type == P_IMMD || src->type == P_CONST) {
520		set_long(pc, e);
521		set_data(pc, src, 0x7f, 9, e);
522		e->inst[1] |= 0x20000000; /* src0 const? */
523	} else {
524		if (src->type == P_ATTR) {
525			set_long(pc, e);
526			e->inst[1] |= 0x00200000;
527		}
528
529		alloc_reg(pc, src);
530		e->inst[0] |= (src->hw << 9);
531	}
532
533	if (is_long(e) && !is_immd(e)) {
534		e->inst[1] |= 0x04000000; /* 32-bit */
535		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
536		if (!(e->inst[1] & 0x20000000))
537			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
538	} else
539		e->inst[0] |= 0x00008000;
540
541	emit(pc, e);
542}
543
544static INLINE void
545emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
546{
547	struct nv50_reg *imm = alloc_immd(pc, f);
548	emit_mov(pc, dst, imm);
549	FREE(imm);
550}
551
552static boolean
553check_swap_src_0_1(struct nv50_pc *pc,
554		   struct nv50_reg **s0, struct nv50_reg **s1)
555{
556	struct nv50_reg *src0 = *s0, *src1 = *s1;
557
558	if (src0->type == P_CONST) {
559		if (src1->type != P_CONST) {
560			*s0 = src1;
561			*s1 = src0;
562			return TRUE;
563		}
564	} else
565	if (src1->type == P_ATTR) {
566		if (src0->type != P_ATTR) {
567			*s0 = src1;
568			*s1 = src0;
569			return TRUE;
570		}
571	}
572
573	return FALSE;
574}
575
576static void
577set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
578		     struct nv50_program_exec *e)
579{
580	struct nv50_reg *temp;
581
582	if (src->type != P_TEMP) {
583		temp = temp_temp(pc);
584		emit_mov(pc, temp, src);
585		src = temp;
586	}
587
588	alloc_reg(pc, src);
589	e->inst[0] |= (src->hw << 9);
590}
591
592static void
593set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
594{
595	if (src->type == P_ATTR) {
596		set_long(pc, e);
597		e->inst[1] |= 0x00200000;
598	} else
599	if (src->type == P_CONST || src->type == P_IMMD) {
600		struct nv50_reg *temp = temp_temp(pc);
601
602		emit_mov(pc, temp, src);
603		src = temp;
604	}
605
606	alloc_reg(pc, src);
607	e->inst[0] |= (src->hw << 9);
608}
609
610static void
611set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
612{
613	if (src->type == P_ATTR) {
614		struct nv50_reg *temp = temp_temp(pc);
615
616		emit_mov(pc, temp, src);
617		src = temp;
618	} else
619	if (src->type == P_CONST || src->type == P_IMMD) {
620		assert(!(e->inst[0] & 0x00800000));
621		if (e->inst[0] & 0x01000000) {
622			struct nv50_reg *temp = temp_temp(pc);
623
624			emit_mov(pc, temp, src);
625			src = temp;
626		} else {
627			set_data(pc, src, 0x7f, 16, e);
628			e->inst[0] |= 0x00800000;
629		}
630	}
631
632	alloc_reg(pc, src);
633	e->inst[0] |= (src->hw << 16);
634}
635
636static void
637set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
638{
639	set_long(pc, e);
640
641	if (src->type == P_ATTR) {
642		struct nv50_reg *temp = temp_temp(pc);
643
644		emit_mov(pc, temp, src);
645		src = temp;
646	} else
647	if (src->type == P_CONST || src->type == P_IMMD) {
648		assert(!(e->inst[0] & 0x01000000));
649		if (e->inst[0] & 0x00800000) {
650			struct nv50_reg *temp = temp_temp(pc);
651
652			emit_mov(pc, temp, src);
653			src = temp;
654		} else {
655			set_data(pc, src, 0x7f, 32+14, e);
656			e->inst[0] |= 0x01000000;
657		}
658	}
659
660	alloc_reg(pc, src);
661	e->inst[1] |= (src->hw << 14);
662}
663
664static void
665emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
666	 struct nv50_reg *src1)
667{
668	struct nv50_program_exec *e = exec(pc);
669
670	e->inst[0] |= 0xc0000000;
671
672	if (!pc->allow32)
673		set_long(pc, e);
674
675	check_swap_src_0_1(pc, &src0, &src1);
676	set_dst(pc, dst, e);
677	set_src_0(pc, src0, e);
678	if (src1->type == P_IMMD && !is_long(e)) {
679		if (src0->neg)
680			e->inst[0] |= 0x00008000;
681		set_immd(pc, src1, e);
682	} else {
683		set_src_1(pc, src1, e);
684		if (src0->neg ^ src1->neg) {
685			if (is_long(e))
686				e->inst[1] |= 0x08000000;
687			else
688				e->inst[0] |= 0x00008000;
689		}
690	}
691
692	emit(pc, e);
693}
694
695static void
696emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
697	 struct nv50_reg *src0, struct nv50_reg *src1)
698{
699	struct nv50_program_exec *e = exec(pc);
700
701	e->inst[0] |= 0xb0000000;
702
703	check_swap_src_0_1(pc, &src0, &src1);
704
705	if (!pc->allow32 || src0->neg || src1->neg) {
706		set_long(pc, e);
707		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
708	}
709
710	set_dst(pc, dst, e);
711	set_src_0(pc, src0, e);
712	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
713		set_src_2(pc, src1, e);
714	else
715	if (src1->type == P_IMMD)
716		set_immd(pc, src1, e);
717	else
718		set_src_1(pc, src1, e);
719
720	emit(pc, e);
721}
722
723static void
724emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
725	    struct nv50_reg *src0, struct nv50_reg *src1)
726{
727	struct nv50_program_exec *e = exec(pc);
728
729	set_long(pc, e);
730	e->inst[0] |= 0xb0000000;
731	e->inst[1] |= (sub << 29);
732
733	check_swap_src_0_1(pc, &src0, &src1);
734	set_dst(pc, dst, e);
735	set_src_0(pc, src0, e);
736	set_src_1(pc, src1, e);
737
738	emit(pc, e);
739}
740
741static INLINE void
742emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
743	 struct nv50_reg *src1)
744{
745	src1->neg ^= 1;
746	emit_add(pc, dst, src0, src1);
747	src1->neg ^= 1;
748}
749
750static void
751emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
752	 struct nv50_reg *src1, struct nv50_reg *src2)
753{
754	struct nv50_program_exec *e = exec(pc);
755
756	e->inst[0] |= 0xe0000000;
757
758	check_swap_src_0_1(pc, &src0, &src1);
759	set_dst(pc, dst, e);
760	set_src_0(pc, src0, e);
761	set_src_1(pc, src1, e);
762	set_src_2(pc, src2, e);
763
764	if (src0->neg ^ src1->neg)
765		e->inst[1] |= 0x04000000;
766	if (src2->neg)
767		e->inst[1] |= 0x08000000;
768
769	emit(pc, e);
770}
771
772static INLINE void
773emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
774	 struct nv50_reg *src1, struct nv50_reg *src2)
775{
776	src2->neg ^= 1;
777	emit_mad(pc, dst, src0, src1, src2);
778	src2->neg ^= 1;
779}
780
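/* emit_flop() emits the scalar "FLOP" unit operations.  Judging from the
 * TGSI opcode handlers further down, the sub codes used in this file are:
 * 0 = RCP, 2 = RSQ, 3 = LG2, 4 = SIN, 5 = COS, 6 = EX2.  Sub 0 and 2 only
 * accept a temporary as source, hence set_src_0_restricted().
 */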
781static void
782emit_flop(struct nv50_pc *pc, unsigned sub,
783	  struct nv50_reg *dst, struct nv50_reg *src)
784{
785	struct nv50_program_exec *e = exec(pc);
786
787	e->inst[0] |= 0x90000000;
788	if (sub) {
789		set_long(pc, e);
790		e->inst[1] |= (sub << 29);
791	}
792
793	set_dst(pc, dst, e);
794
795	if (sub == 0 || sub == 2)
796		set_src_0_restricted(pc, src, e);
797	else
798		set_src_0(pc, src, e);
799
800	emit(pc, e);
801}
802
803static void
804emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
805{
806	struct nv50_program_exec *e = exec(pc);
807
808	e->inst[0] |= 0xb0000000;
809
810	set_dst(pc, dst, e);
811	set_src_0(pc, src, e);
812	set_long(pc, e);
813	e->inst[1] |= (6 << 29) | 0x00004000;
814
815	emit(pc, e);
816}
817
818static void
819emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
820{
821	struct nv50_program_exec *e = exec(pc);
822
823	e->inst[0] |= 0xb0000000;
824
825	set_dst(pc, dst, e);
826	set_src_0(pc, src, e);
827	set_long(pc, e);
828	e->inst[1] |= (6 << 29);
829
830	emit(pc, e);
831}
832
833#define CVTOP_RN	0x01
834#define CVTOP_FLOOR	0x03
835#define CVTOP_CEIL	0x05
836#define CVTOP_TRUNC	0x07
837#define CVTOP_SAT	0x08
838#define CVTOP_ABS	0x10
839
840/* 0x04 == 32 bit dst */
841/* 0x40 == dst is float */
842/* 0x80 == src is float */
843#define CVT_F32_F32 0xc4
844#define CVT_F32_S32 0x44
845#define CVT_F32_U32 0x64
846#define CVT_S32_F32 0x8c
847#define CVT_S32_S32 0x0c
848#define CVT_NEG     0x20
849#define CVT_RI      0x08
850
851static void
852emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
853	 int wp, unsigned cvn, unsigned fmt)
854{
855	struct nv50_program_exec *e;
856
857	e = exec(pc);
858	set_long(pc, e);
859
860	e->inst[0] |= 0xa0000000;
861	e->inst[1] |= 0x00004000; /* 32 bit src */
862	e->inst[1] |= (cvn << 16);
863	e->inst[1] |= (fmt << 24);
864	set_src_0(pc, src, e);
865
866	if (wp >= 0)
867		set_pred_wr(pc, 1, wp, e);
868
869	if (dst)
870		set_dst(pc, dst, e);
871	else {
872		e->inst[0] |= 0x000001fc;
873		e->inst[1] |= 0x00000008;
874	}
875
876	emit(pc, e);
877}
878
879/* nv50 Condition codes:
880 *  0x1 = LT
881 *  0x2 = EQ
882 *  0x3 = LE
883 *  0x4 = GT
884 *  0x5 = NE
885 *  0x6 = GE
886 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
887 *  0x8 = unordered bit (allows NaN)
888 */
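/* When check_swap_src_0_1() exchanges the operands, the condition has to
 * be mirrored as well; cc_swapped below maps LT<->GT and LE<->GE while
 * keeping EQ/NE and the unordered bit.  E.g. "SLT dst, const, temp" is
 * emitted as a GT test with the operands exchanged.  Note also that SNE is
 * mapped to 0xd = NE | unordered in map_tgsi_setop_cc(), so comparisons
 * involving NaN count as "not equal".
 */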
889static void
890emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
891	 struct nv50_reg *src0, struct nv50_reg *src1)
892{
893	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
894
895	struct nv50_program_exec *e = exec(pc);
896	struct nv50_reg *rdst;
897
898	assert(ccode < 16);
899	if (check_swap_src_0_1(pc, &src0, &src1))
900		ccode = cc_swapped[ccode & 7] | (ccode & 8);
901
902	rdst = dst;
903	if (dst && dst->type != P_TEMP)
904		dst = alloc_temp(pc, NULL);
905
906	/* set.u32 */
907	set_long(pc, e);
908	e->inst[0] |= 0xb0000000;
909	e->inst[1] |= 0x60000000 | (ccode << 14);
910
911	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
912	 * that doesn't seem to match what the hw actually does
913	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
914	 */
915
916	if (wp >= 0)
917		set_pred_wr(pc, 1, wp, e);
918	if (dst)
919		set_dst(pc, dst, e);
920	else {
921		e->inst[0] |= 0x000001fc;
922		e->inst[1] |= 0x00000008;
923	}
924
925	set_src_0(pc, src0, e);
926	set_src_1(pc, src1, e);
927
928	emit(pc, e);
929	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */
930
931	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
932	if (rdst)
933		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
934	if (rdst && rdst != dst)
935		free_temp(pc, dst);
936}
937
938static INLINE unsigned
939map_tgsi_setop_cc(unsigned op)
940{
941	switch (op) {
942	case TGSI_OPCODE_SLT: return 0x1;
943	case TGSI_OPCODE_SGE: return 0x6;
944	case TGSI_OPCODE_SEQ: return 0x2;
945	case TGSI_OPCODE_SGT: return 0x4;
946	case TGSI_OPCODE_SLE: return 0x3;
947	case TGSI_OPCODE_SNE: return 0xd;
948	default:
949		assert(0);
950		return 0;
951	}
952}
953
954static INLINE void
955emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
956{
957	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
958}
959
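/* POW is expanded as dst = 2^(e * log2(v)): LG2 (flop sub 3), a MUL by the
 * exponent, the EX2 prescale op and finally EX2 (flop sub 6).
 */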
960static void
961emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
962	 struct nv50_reg *v, struct nv50_reg *e)
963{
964	struct nv50_reg *temp = alloc_temp(pc, NULL);
965
966	emit_flop(pc, 3, temp, v);
967	emit_mul(pc, temp, temp, e);
968	emit_preex2(pc, temp, temp);
969	emit_flop(pc, 6, dst, temp);
970
971	free_temp(pc, temp);
972}
973
974static INLINE void
975emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
976{
977	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
978}
979
980static INLINE void
981emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
982{
983	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
984}
985
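/* LIT: as far as the sequence below goes, this computes
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0.0)
 *   dst.z = (src.x > 0.0) ? max(src.y, 0.0)^clamp(src.w, -128, 128) : 0.0
 *   dst.w = 1.0
 * The first MAX also writes predicate reg 0, which then guards the MOV of
 * 0.0 into dst.z for the src.x <= 0 case.
 */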
986static void
987emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
988	 struct nv50_reg **src)
989{
990	struct nv50_reg *one = alloc_immd(pc, 1.0);
991	struct nv50_reg *zero = alloc_immd(pc, 0.0);
992	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
993	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
994	struct nv50_reg *tmp[4];
995	boolean allow32 = pc->allow32;
996
997	pc->allow32 = FALSE;
998
999	if (mask & (3 << 1)) {
1000		tmp[0] = alloc_temp(pc, NULL);
1001		emit_minmax(pc, 4, tmp[0], src[0], zero);
1002	}
1003
1004	if (mask & (1 << 2)) {
1005		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
1006
1007		tmp[1] = temp_temp(pc);
1008		emit_minmax(pc, 4, tmp[1], src[1], zero);
1009
1010		tmp[3] = temp_temp(pc);
1011		emit_minmax(pc, 4, tmp[3], src[3], neg128);
1012		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
1013
1014		emit_pow(pc, dst[2], tmp[1], tmp[3]);
1015		emit_mov(pc, dst[2], zero);
1016		set_pred(pc, 3, 0, pc->p->exec_tail);
1017	}
1018
1019	if (mask & (1 << 1))
1020		assimilate_temp(pc, dst[1], tmp[0]);
1021	else
1022	if (mask & (1 << 2))
1023		free_temp(pc, tmp[0]);
1024
1025	pc->allow32 = allow32;
1026
1027	/* do this last, in case src[i,j] == dst[0,3] */
1028	if (mask & (1 << 0))
1029		emit_mov(pc, dst[0], one);
1030
1031	if (mask & (1 << 3))
1032		emit_mov(pc, dst[3], one);
1033
1034	FREE(pos128);
1035	FREE(neg128);
1036	FREE(zero);
1037	FREE(one);
1038}
1039
1040static INLINE void
1041emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1042{
1043	emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
1044}
1045
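/* Discard the fragment when src is negative: a cvt-style instruction
 * (note the 0xa opcode and the CVT_NEG-like 0x20000000 bit for negated
 * sources) writes predicate reg 1 from src, and the following predicated
 * instruction - probably KILP, as noted below - fires on the LT condition.
 */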
1046static void
1047emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
1048{
1049	struct nv50_program_exec *e;
1050	const int r_pred = 1;
1051
1052	/* Sets predicate reg ? */
1053	e = exec(pc);
1054	e->inst[0] = 0xa00001fd;
1055	e->inst[1] = 0xc4014788;
1056	set_src_0(pc, src, e);
1057	set_pred_wr(pc, 1, r_pred, e);
1058	if (src->neg)
1059		e->inst[1] |= 0x20000000;
1060	emit(pc, e);
1061
1062	/* This is probably KILP */
1063	e = exec(pc);
1064	e->inst[0] = 0x000001fe;
1065	set_long(pc, e);
1066	set_pred(pc, 1 /* LT? */, r_pred, e);
1067	emit(pc, e);
1068}
1069
1070static void
1071emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1072	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
1073{
1074	struct nv50_reg *temp, *t[4];
1075	struct nv50_program_exec *e;
1076
1077	unsigned c, mode, dim;
1078
1079	switch (type) {
1080	case TGSI_TEXTURE_1D:
1081		dim = 1;
1082		break;
1083	case TGSI_TEXTURE_UNKNOWN:
1084	case TGSI_TEXTURE_2D:
1085	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1086	case TGSI_TEXTURE_RECT:
1087		dim = 2;
1088		break;
1089	case TGSI_TEXTURE_3D:
1090	case TGSI_TEXTURE_CUBE:
1091	case TGSI_TEXTURE_SHADOW2D:
1092	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1093		dim = 3;
1094		break;
1095	default:
1096		assert(0);
1097		break;
1098	}
1099
1100	/* some cards need t[0]'s hw index to be a multiple of 4 */
1101	alloc_temp4(pc, t, 0);
1102
1103	if (proj) {
1104		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1105			mode = pc->interp_mode[src[0]->index];
1106
1107			t[3]->rhw = src[3]->rhw;
1108			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1109			emit_flop(pc, 0, t[3], t[3]);
1110
1111			for (c = 0; c < dim; c++) {
1112				t[c]->rhw = src[c]->rhw;
1113				emit_interp(pc, t[c], t[3],
1114					    (mode | INTERP_PERSPECTIVE));
1115			}
1116		} else {
1117			emit_flop(pc, 0, t[3], src[3]);
1118			for (c = 0; c < dim; c++)
1119				emit_mul(pc, t[c], src[c], t[3]);
1120
1121			/* XXX: for some reason the blob sometimes uses MAD:
1122			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1123			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1124			 */
1125		}
1126	} else {
1127		if (type == TGSI_TEXTURE_CUBE) {
1128			temp = temp_temp(pc);
1129			emit_minmax(pc, 4, temp, src[0], src[1]);
1130			emit_minmax(pc, 4, temp, temp, src[2]);
1131			emit_flop(pc, 0, temp, temp);
1132			for (c = 0; c < 3; c++)
1133				emit_mul(pc, t[c], src[c], temp);
1134		} else {
1135			for (c = 0; c < dim; c++)
1136				emit_mov(pc, t[c], src[c]);
1137		}
1138	}
1139
1140	e = exec(pc);
1141	set_long(pc, e);
1142	e->inst[0] |= 0xf0000000;
1143	e->inst[1] |= 0x00000004;
1144	set_dst(pc, t[0], e);
1145	e->inst[0] |= (unit << 9);
1146
1147	if (dim == 2)
1148		e->inst[0] |= 0x00400000;
1149	else
1150	if (dim == 3)
1151		e->inst[0] |= 0x00800000;
1152
1153	e->inst[0] |= (mask & 0x3) << 25;
1154	e->inst[1] |= (mask & 0xc) << 12;
1155
1156	emit(pc, e);
1157
1158#if 1
1159	c = 0;
1160	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
1161	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
1162	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
1163	if (mask & 8) emit_mov(pc, dst[3], t[c]);
1164
1165	free_temp4(pc, t);
1166#else
1167	/* XXX: if e.g. MUL is used directly after TEX, it would still use
1168	 * the texture coordinates, not the fetched values: latency ? */
1169
1170	for (c = 0; c < 4; c++) {
1171		if (mask & (1 << c))
1172			assimilate_temp(pc, dst[c], t[c]);
1173		else
1174			free_temp(pc, t[c]);
1175	}
1176#endif
1177}
1178
1179static void
1180emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
1181	    struct nv50_program_exec **join)
1182{
1183	struct nv50_program_exec *e = exec(pc);
1184
1185	if (join) {
1186		set_long(pc, e);
1187		e->inst[0] |= 0xa0000002;
1188		emit(pc, e);
1189		*join = e;
1190		e = exec(pc);
1191	}
1192
1193	set_long(pc, e);
1194	e->inst[0] |= 0x10000002;
1195	if (pred >= 0)
1196		set_pred(pc, cc, pred, e);
1197	emit(pc, e);
1198}
1199
1200static void
1201emit_nop(struct nv50_pc *pc)
1202{
1203	struct nv50_program_exec *e = exec(pc);
1204
1205	e->inst[0] = 0xf0000000;
1206	set_long(pc, e);
1207	e->inst[1] = 0xe0000000;
1208	emit(pc, e);
1209}
1210
1211static void
1212emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1213{
1214	struct nv50_program_exec *e = exec(pc);
1215
1216	assert(src->type == P_TEMP);
1217
1218	e->inst[0] = 0xc0140000;
1219	e->inst[1] = 0x89800000;
1220	set_long(pc, e);
1221	set_dst(pc, dst, e);
1222	set_src_0(pc, src, e);
1223	set_src_2(pc, src, e);
1224
1225	emit(pc, e);
1226}
1227
1228static void
1229emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1230{
1231	struct nv50_program_exec *e = exec(pc);
1232
1233	assert(src->type == P_TEMP);
1234
1235	if (!src->neg) /* ! double negation */
1236		emit_neg(pc, src, src);
1237
1238	e->inst[0] = 0xc0150000;
1239	e->inst[1] = 0x8a400000;
1240	set_long(pc, e);
1241	set_dst(pc, dst, e);
1242	set_src_0(pc, src, e);
1243	set_src_2(pc, src, e);
1244
1245	emit(pc, e);
1246}
1247
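/* Widen an already emitted short instruction to the long encoding, moving
 * the fields that sit in different places in the two forms.  E.g. for MUL
 * (0xc) the short-form negate bit 0x00008000 is cleared from inst[0] and
 * reappears as 0x08000000 in inst[1], matching what emit_mul() sets in the
 * long case.
 */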
1248static void
1249convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1250{
1251	unsigned q = 0, m = ~0;
1252
1253	assert(!is_long(e));
1254
1255	switch (e->inst[0] >> 28) {
1256	case 0x1:
1257		/* MOV */
1258		q = 0x0403c000;
1259		m = 0xffff7fff;
1260		break;
1261	case 0x8:
1262		/* INTERP (move centroid, perspective and flat bits) */
1263		m = ~0x03000100;
1264		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1265		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1266		break;
1267	case 0x9:
1268		/* RCP */
1269		break;
1270	case 0xB:
1271		/* ADD */
1272		m = ~(127 << 16);
1273		q = ((e->inst[0] & (~m)) >> 2);
1274		break;
1275	case 0xC:
1276		/* MUL */
1277		m = ~0x00008000;
1278		q = ((e->inst[0] & (~m)) << 12);
1279		break;
1280	case 0xE:
1281		/* MAD (if src2 == dst) */
1282		q = ((e->inst[0] & 0x1fc) << 12);
1283		break;
1284	default:
1285		assert(0);
1286		break;
1287	}
1288
1289	set_long(pc, e);
1290	pc->p->exec_size++;
1291
1292	e->inst[0] &= m;
1293	e->inst[1] |= q;
1294}
1295
1296/* Some operations support an optional negation flag. */
1297static boolean
1298negate_supported(const struct tgsi_full_instruction *insn, int i)
1299{
1300	int s;
1301
1302	switch (insn->Instruction.Opcode) {
1303	case TGSI_OPCODE_DDY:
1304	case TGSI_OPCODE_DP3:
1305	case TGSI_OPCODE_DP4:
1306	case TGSI_OPCODE_MUL:
1307	case TGSI_OPCODE_KIL:
1308	case TGSI_OPCODE_ADD:
1309	case TGSI_OPCODE_SUB:
1310	case TGSI_OPCODE_MAD:
1311		break;
1312	case TGSI_OPCODE_POW:
1313		if (i == 1)
1314			break;
1315		return FALSE;
1316	default:
1317		return FALSE;
1318	}
1319
1320	/* Watch out for possible multiple uses of an nv50_reg, we
1321	 * can't use nv50_reg::neg in these cases.
1322	 */
1323	for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) {
1324		if (s == i)
1325			continue;
1326		if ((insn->FullSrcRegisters[s].SrcRegister.Index ==
1327		     insn->FullSrcRegisters[i].SrcRegister.Index) &&
1328		    (insn->FullSrcRegisters[s].SrcRegister.File ==
1329		     insn->FullSrcRegisters[i].SrcRegister.File))
1330			return FALSE;
1331	}
1332
1333	return TRUE;
1334}
1335
1336/* Return a read mask for source registers deduced from opcode & write mask. */
1337static unsigned
1338nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
1339{
1340	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
1341
1342	switch (insn->Instruction.Opcode) {
1343	case TGSI_OPCODE_COS:
1344	case TGSI_OPCODE_SIN:
1345		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
1346	case TGSI_OPCODE_DP3:
1347		return 0x7;
1348	case TGSI_OPCODE_DP4:
1349	case TGSI_OPCODE_DPH:
1350	case TGSI_OPCODE_KIL: /* WriteMask ignored */
1351		return 0xf;
1352	case TGSI_OPCODE_DST:
1353		return mask & (c ? 0xa : 0x6);
1354	case TGSI_OPCODE_EX2:
1355	case TGSI_OPCODE_LG2:
1356	case TGSI_OPCODE_POW:
1357	case TGSI_OPCODE_RCP:
1358	case TGSI_OPCODE_RSQ:
1359	case TGSI_OPCODE_SCS:
1360		return 0x1;
1361	case TGSI_OPCODE_LIT:
1362		return 0xb;
1363	case TGSI_OPCODE_TEX:
1364	case TGSI_OPCODE_TXP:
1365	{
1366		const struct tgsi_instruction_ext_texture *tex;
1367
1368		assert(insn->Instruction.Extended);
1369		tex = &insn->InstructionExtTexture;
1370
1371		mask = 0x7;
1372		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1373			mask |= 0x8;
1374
1375		switch (tex->Texture) {
1376		case TGSI_TEXTURE_1D:
1377			mask &= 0x9;
1378			break;
1379		case TGSI_TEXTURE_2D:
1380			mask &= 0xb;
1381			break;
1382		default:
1383			break;
1384		}
1385	}
1386		return mask;
1387	case TGSI_OPCODE_XPD:
1388		x = 0;
1389		if (mask & 1) x |= 0x6;
1390		if (mask & 2) x |= 0x5;
1391		if (mask & 4) x |= 0x3;
1392		return x;
1393	default:
1394		break;
1395	}
1396
1397	return mask;
1398}
1399
1400static struct nv50_reg *
1401tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1402{
1403	switch (dst->DstRegister.File) {
1404	case TGSI_FILE_TEMPORARY:
1405		return &pc->temp[dst->DstRegister.Index * 4 + c];
1406	case TGSI_FILE_OUTPUT:
1407		return &pc->result[dst->DstRegister.Index * 4 + c];
1408	case TGSI_FILE_NULL:
1409		return NULL;
1410	default:
1411		break;
1412	}
1413
1414	return NULL;
1415}
1416
1417static struct nv50_reg *
1418tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1419	 boolean neg)
1420{
1421	struct nv50_reg *r = NULL;
1422	struct nv50_reg *temp;
1423	unsigned sgn, c;
1424
1425	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1426
1427	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1428	switch (c) {
1429	case TGSI_EXTSWIZZLE_X:
1430	case TGSI_EXTSWIZZLE_Y:
1431	case TGSI_EXTSWIZZLE_Z:
1432	case TGSI_EXTSWIZZLE_W:
1433		switch (src->SrcRegister.File) {
1434		case TGSI_FILE_INPUT:
1435			r = &pc->attr[src->SrcRegister.Index * 4 + c];
1436			break;
1437		case TGSI_FILE_TEMPORARY:
1438			r = &pc->temp[src->SrcRegister.Index * 4 + c];
1439			break;
1440		case TGSI_FILE_CONSTANT:
1441			r = &pc->param[src->SrcRegister.Index * 4 + c];
1442			break;
1443		case TGSI_FILE_IMMEDIATE:
1444			r = &pc->immd[src->SrcRegister.Index * 4 + c];
1445			break;
1446		case TGSI_FILE_SAMPLER:
1447			break;
1448		default:
1449			assert(0);
1450			break;
1451		}
1452		break;
1453	case TGSI_EXTSWIZZLE_ZERO:
1454		r = alloc_immd(pc, 0.0);
1455		return r;
1456	case TGSI_EXTSWIZZLE_ONE:
1457		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1458			return alloc_immd(pc, -1.0);
1459		return alloc_immd(pc, 1.0);
1460	default:
1461		assert(0);
1462		break;
1463	}
1464
1465	switch (sgn) {
1466	case TGSI_UTIL_SIGN_KEEP:
1467		break;
1468	case TGSI_UTIL_SIGN_CLEAR:
1469		temp = temp_temp(pc);
1470		emit_abs(pc, temp, r);
1471		r = temp;
1472		break;
1473	case TGSI_UTIL_SIGN_TOGGLE:
1474		if (neg)
1475			r->neg = 1;
1476		else {
1477			temp = temp_temp(pc);
1478			emit_neg(pc, temp, r);
1479			r = temp;
1480		}
1481		break;
1482	case TGSI_UTIL_SIGN_SET:
1483		temp = temp_temp(pc);
1484		emit_abs(pc, temp, r);
1485		if (neg)
1486			temp->neg = 1;
1487		else
1488			emit_neg(pc, temp, temp);
1489		r = temp;
1490		break;
1491	default:
1492		assert(0);
1493		break;
1494	}
1495
1496	return r;
1497}
1498
1499/* return TRUE for ops that produce only a single result */
1500static boolean
1501is_scalar_op(unsigned op)
1502{
1503	switch (op) {
1504	case TGSI_OPCODE_COS:
1505	case TGSI_OPCODE_DP2:
1506	case TGSI_OPCODE_DP3:
1507	case TGSI_OPCODE_DP4:
1508	case TGSI_OPCODE_DPH:
1509	case TGSI_OPCODE_EX2:
1510	case TGSI_OPCODE_LG2:
1511	case TGSI_OPCODE_POW:
1512	case TGSI_OPCODE_RCP:
1513	case TGSI_OPCODE_RSQ:
1514	case TGSI_OPCODE_SIN:
1515		/*
1516	case TGSI_OPCODE_KIL:
1517	case TGSI_OPCODE_LIT:
1518	case TGSI_OPCODE_SCS:
1519		*/
1520		return TRUE;
1521	default:
1522		return FALSE;
1523	}
1524}
1525
1526/* Returns a bitmask indicating which dst components depend
1527 * on source s, component c (reverse of nv50_tgsi_src_mask).
1528 */
1529static unsigned
1530nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1531{
1532	if (is_scalar_op(op))
1533		return 0x1;
1534
1535	switch (op) {
1536	case TGSI_OPCODE_DST:
1537		return (1 << c) & (s ? 0xa : 0x6);
1538	case TGSI_OPCODE_XPD:
1539		switch (c) {
1540		case 0: return 0x6;
1541		case 1: return 0x5;
1542		case 2: return 0x3;
1543		case 3: return 0x0;
1544		default:
1545			assert(0);
1546			return 0x0;
1547		}
1548	case TGSI_OPCODE_LIT:
1549	case TGSI_OPCODE_SCS:
1550	case TGSI_OPCODE_TEX:
1551	case TGSI_OPCODE_TXP:
1552		/* these take care of dangerous swizzles themselves */
1553		return 0x0;
1554	case TGSI_OPCODE_IF:
1555	case TGSI_OPCODE_KIL:
1556		/* don't call this function for these ops */
1557		assert(0);
1558		return 0;
1559	default:
1560		/* linear vector instruction */
1561		return (1 << c);
1562	}
1563}
1564
1565static INLINE boolean
1566has_pred(struct nv50_program_exec *e, unsigned cc)
1567{
1568	if (!is_long(e) || is_immd(e))
1569		return FALSE;
1570	return ((e->inst[1] & 0x780) == (cc << 7));
1571}
1572
1573/* on ENDIF see if we can do "@p0.neu single_op" instead of:
1574 *        join_at ENDIF
1575 *        @p0.eq bra ENDIF
1576 *        single_op
1577 * ENDIF: nop.join
1578 */
1579static boolean
1580nv50_kill_branch(struct nv50_pc *pc)
1581{
1582	int lvl = pc->if_lvl;
1583
1584	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
1585		return FALSE;
1586
1587	/* if ccode == 'true', the BRA is from an ELSE and the predicate
1588	 * reg may no longer be valid, since we currently always use $p0
1589	 */
1590	if (has_pred(pc->if_insn[lvl], 0xf))
1591		return FALSE;
1592	assert(pc->if_insn[lvl] && pc->br_join[lvl]);
1593
1594	/* We'll use the exec allocated for JOIN_AT (as we can't easily
1595	 * update prev's next); if exec_tail is BRK, update the pointer.
1596	 */
1597	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
1598		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];
1599
1600	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
1601
1602	*pc->br_join[lvl] = *pc->p->exec_tail;
1603
1604	FREE(pc->if_insn[lvl]);
1605	FREE(pc->p->exec_tail);
1606
1607	pc->p->exec_tail = pc->br_join[lvl];
1608	pc->p->exec_tail->next = NULL;
1609	set_pred(pc, 0xd, 0, pc->p->exec_tail);
1610
1611	return TRUE;
1612}
1613
1614static boolean
1615nv50_program_tx_insn(struct nv50_pc *pc,
1616		     const struct tgsi_full_instruction *inst)
1617{
1618	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
1619	unsigned mask, sat, unit;
1620	int i, c;
1621
1622	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1623	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1624
1625	memset(src, 0, sizeof(src));
1626
1627	for (c = 0; c < 4; c++) {
1628		if ((mask & (1 << c)) && !pc->r_dst[c])
1629			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1630		else
1631			dst[c] = pc->r_dst[c];
1632		rdst[c] = dst[c];
1633	}
1634
1635	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1636		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1637		unsigned src_mask;
1638		boolean neg_supp;
1639
1640		src_mask = nv50_tgsi_src_mask(inst, i);
1641		neg_supp = negate_supported(inst, i);
1642
1643		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1644			unit = fs->SrcRegister.Index;
1645
1646		for (c = 0; c < 4; c++)
1647			if (src_mask & (1 << c))
1648				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
1649	}
1650
1651	brdc = temp = pc->r_brdc;
1652	if (brdc && brdc->type != P_TEMP) {
1653		temp = temp_temp(pc);
1654		if (sat)
1655			brdc = temp;
1656	} else
1657	if (sat) {
1658		for (c = 0; c < 4; c++) {
1659			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
1660				continue;
1661			rdst[c] = dst[c];
1662			dst[c] = temp_temp(pc);
1663		}
1664	}
1665
1666	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
1667
1668	switch (inst->Instruction.Opcode) {
1669	case TGSI_OPCODE_ABS:
1670		for (c = 0; c < 4; c++) {
1671			if (!(mask & (1 << c)))
1672				continue;
1673			emit_abs(pc, dst[c], src[0][c]);
1674		}
1675		break;
1676	case TGSI_OPCODE_ADD:
1677		for (c = 0; c < 4; c++) {
1678			if (!(mask & (1 << c)))
1679				continue;
1680			emit_add(pc, dst[c], src[0][c], src[1][c]);
1681		}
1682		break;
1683	case TGSI_OPCODE_BGNLOOP:
1684		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
1685		break;
1686	case TGSI_OPCODE_BRK:
1687		emit_branch(pc, -1, 0, NULL);
1688		assert(pc->loop_lvl > 0);
1689		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
1690		break;
1691	case TGSI_OPCODE_CEIL:
1692		for (c = 0; c < 4; c++) {
1693			if (!(mask & (1 << c)))
1694				continue;
1695			emit_cvt(pc, dst[c], src[0][c], -1,
1696				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
1697		}
1698		break;
1699	case TGSI_OPCODE_COS:
1700		if (mask & 8) {
1701			emit_precossin(pc, temp, src[0][3]);
1702			emit_flop(pc, 5, dst[3], temp);
1703			if (!(mask &= 7))
1704				break;
1705			if (temp == dst[3])
1706				temp = brdc = temp_temp(pc);
1707		}
1708		emit_precossin(pc, temp, src[0][0]);
1709		emit_flop(pc, 5, brdc, temp);
1710		break;
1711	case TGSI_OPCODE_DDX:
1712		for (c = 0; c < 4; c++) {
1713			if (!(mask & (1 << c)))
1714				continue;
1715			emit_ddx(pc, dst[c], src[0][c]);
1716		}
1717		break;
1718	case TGSI_OPCODE_DDY:
1719		for (c = 0; c < 4; c++) {
1720			if (!(mask & (1 << c)))
1721				continue;
1722			emit_ddy(pc, dst[c], src[0][c]);
1723		}
1724		break;
1725	case TGSI_OPCODE_DP3:
1726		emit_mul(pc, temp, src[0][0], src[1][0]);
1727		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1728		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
1729		break;
1730	case TGSI_OPCODE_DP4:
1731		emit_mul(pc, temp, src[0][0], src[1][0]);
1732		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1733		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1734		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
1735		break;
1736	case TGSI_OPCODE_DPH:
1737		emit_mul(pc, temp, src[0][0], src[1][0]);
1738		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1739		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1740		emit_add(pc, brdc, src[1][3], temp);
1741		break;
1742	case TGSI_OPCODE_DST:
1743		if (mask & (1 << 1))
1744			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1745		if (mask & (1 << 2))
1746			emit_mov(pc, dst[2], src[0][2]);
1747		if (mask & (1 << 3))
1748			emit_mov(pc, dst[3], src[1][3]);
1749		if (mask & (1 << 0))
1750			emit_mov_immdval(pc, dst[0], 1.0f);
1751		break;
1752	case TGSI_OPCODE_ELSE:
1753		emit_branch(pc, -1, 0, NULL);
1754		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
1755		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
1756		break;
1757	case TGSI_OPCODE_ENDIF:
1758		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
1759
1760		/* try to replace branch over 1 insn with a predicated insn */
1761		if (nv50_kill_branch(pc) == TRUE)
1762			break;
1763
1764		if (pc->br_join[pc->if_lvl]) {
1765			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
1766			pc->br_join[pc->if_lvl] = NULL;
1767		}
1768		/* emit a NOP as join point, we could set it on the next
1769		 * one, but would have to make sure it is long and !immd
1770		 */
1771		emit_nop(pc);
1772		pc->p->exec_tail->inst[1] |= 2;
1773		break;
1774	case TGSI_OPCODE_ENDLOOP:
1775		emit_branch(pc, -1, 0, NULL);
1776		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
1777		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
1778		break;
1779	case TGSI_OPCODE_EX2:
1780		emit_preex2(pc, temp, src[0][0]);
1781		emit_flop(pc, 6, brdc, temp);
1782		break;
1783	case TGSI_OPCODE_FLR:
1784		for (c = 0; c < 4; c++) {
1785			if (!(mask & (1 << c)))
1786				continue;
1787			emit_flr(pc, dst[c], src[0][c]);
1788		}
1789		break;
1790	case TGSI_OPCODE_FRC:
1791		temp = temp_temp(pc);
1792		for (c = 0; c < 4; c++) {
1793			if (!(mask & (1 << c)))
1794				continue;
1795			emit_flr(pc, temp, src[0][c]);
1796			emit_sub(pc, dst[c], src[0][c], temp);
1797		}
1798		break;
1799	case TGSI_OPCODE_IF:
1800		/* emitting a join_at may not be necessary */
1801		assert(pc->if_lvl < MAX_IF_DEPTH);
1802		set_pred_wr(pc, 1, 0, pc->if_cond);
1803		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
1804		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
1805		break;
1806	case TGSI_OPCODE_KIL:
1807		emit_kil(pc, src[0][0]);
1808		emit_kil(pc, src[0][1]);
1809		emit_kil(pc, src[0][2]);
1810		emit_kil(pc, src[0][3]);
1811		break;
1812	case TGSI_OPCODE_LIT:
1813		emit_lit(pc, &dst[0], mask, &src[0][0]);
1814		break;
1815	case TGSI_OPCODE_LG2:
1816		emit_flop(pc, 3, brdc, src[0][0]);
1817		break;
1818	case TGSI_OPCODE_LRP:
1819		temp = temp_temp(pc);
1820		for (c = 0; c < 4; c++) {
1821			if (!(mask & (1 << c)))
1822				continue;
1823			emit_sub(pc, temp, src[1][c], src[2][c]);
1824			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1825		}
1826		break;
1827	case TGSI_OPCODE_MAD:
1828		for (c = 0; c < 4; c++) {
1829			if (!(mask & (1 << c)))
1830				continue;
1831			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1832		}
1833		break;
1834	case TGSI_OPCODE_MAX:
1835		for (c = 0; c < 4; c++) {
1836			if (!(mask & (1 << c)))
1837				continue;
1838			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1839		}
1840		break;
1841	case TGSI_OPCODE_MIN:
1842		for (c = 0; c < 4; c++) {
1843			if (!(mask & (1 << c)))
1844				continue;
1845			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1846		}
1847		break;
1848	case TGSI_OPCODE_MOV:
1849	case TGSI_OPCODE_SWZ:
1850		for (c = 0; c < 4; c++) {
1851			if (!(mask & (1 << c)))
1852				continue;
1853			emit_mov(pc, dst[c], src[0][c]);
1854		}
1855		break;
1856	case TGSI_OPCODE_MUL:
1857		for (c = 0; c < 4; c++) {
1858			if (!(mask & (1 << c)))
1859				continue;
1860			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1861		}
1862		break;
1863	case TGSI_OPCODE_POW:
1864		emit_pow(pc, brdc, src[0][0], src[1][0]);
1865		break;
1866	case TGSI_OPCODE_RCP:
1867		emit_flop(pc, 0, brdc, src[0][0]);
1868		break;
1869	case TGSI_OPCODE_RSQ:
1870		emit_flop(pc, 2, brdc, src[0][0]);
1871		break;
1872	case TGSI_OPCODE_SCS:
1873		temp = temp_temp(pc);
1874		if (mask & 3)
1875			emit_precossin(pc, temp, src[0][0]);
1876		if (mask & (1 << 0))
1877			emit_flop(pc, 5, dst[0], temp);
1878		if (mask & (1 << 1))
1879			emit_flop(pc, 4, dst[1], temp);
1880		if (mask & (1 << 2))
1881			emit_mov_immdval(pc, dst[2], 0.0);
1882		if (mask & (1 << 3))
1883			emit_mov_immdval(pc, dst[3], 1.0);
1884		break;
1885	case TGSI_OPCODE_SIN:
1886		if (mask & 8) {
1887			emit_precossin(pc, temp, src[0][3]);
1888			emit_flop(pc, 4, dst[3], temp);
1889			if (!(mask &= 7))
1890				break;
1891			if (temp == dst[3])
1892				temp = brdc = temp_temp(pc);
1893		}
1894		emit_precossin(pc, temp, src[0][0]);
1895		emit_flop(pc, 4, brdc, temp);
1896		break;
1897	case TGSI_OPCODE_SLT:
1898	case TGSI_OPCODE_SGE:
1899	case TGSI_OPCODE_SEQ:
1900	case TGSI_OPCODE_SGT:
1901	case TGSI_OPCODE_SLE:
1902	case TGSI_OPCODE_SNE:
1903		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
1904		for (c = 0; c < 4; c++) {
1905			if (!(mask & (1 << c)))
1906				continue;
1907			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
1908		}
1909		break;
1910	case TGSI_OPCODE_SUB:
1911		for (c = 0; c < 4; c++) {
1912			if (!(mask & (1 << c)))
1913				continue;
1914			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1915		}
1916		break;
1917	case TGSI_OPCODE_TEX:
1918		emit_tex(pc, dst, mask, src[0], unit,
1919			 inst->InstructionExtTexture.Texture, FALSE);
1920		break;
1921	case TGSI_OPCODE_TXP:
1922		emit_tex(pc, dst, mask, src[0], unit,
1923			 inst->InstructionExtTexture.Texture, TRUE);
1924		break;
1925	case TGSI_OPCODE_TRUNC:
1926		for (c = 0; c < 4; c++) {
1927			if (!(mask & (1 << c)))
1928				continue;
1929			emit_cvt(pc, dst[c], src[0][c], -1,
1930				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
1931		}
1932		break;
1933	case TGSI_OPCODE_XPD:
1934		temp = temp_temp(pc);
1935		if (mask & (1 << 0)) {
1936			emit_mul(pc, temp, src[0][2], src[1][1]);
1937			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1938		}
1939		if (mask & (1 << 1)) {
1940			emit_mul(pc, temp, src[0][0], src[1][2]);
1941			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1942		}
1943		if (mask & (1 << 2)) {
1944			emit_mul(pc, temp, src[0][1], src[1][0]);
1945			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1946		}
1947		if (mask & (1 << 3))
1948			emit_mov_immdval(pc, dst[3], 1.0);
1949		break;
1950	case TGSI_OPCODE_END:
1951		break;
1952	default:
1953		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1954		return FALSE;
1955	}
1956
1957	if (brdc) {
1958		if (sat)
1959			emit_sat(pc, brdc, brdc);
1960		for (c = 0; c < 4; c++)
1961			if ((mask & (1 << c)) && dst[c] != brdc)
1962				emit_mov(pc, dst[c], brdc);
1963	} else
1964	if (sat) {
1965		for (c = 0; c < 4; c++) {
1966			if (!(mask & (1 << c)))
1967				continue;
1968			/* in this case we saturate later */
1969			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
1970				continue;
1971			emit_sat(pc, rdst[c], dst[c]);
1972		}
1973	}
1974
1975	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1976		for (c = 0; c < 4; c++) {
1977			if (!src[i][c])
1978				continue;
1979			src[i][c]->neg = 0;
1980			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1981				FREE(src[i][c]);
1982		}
1983	}
1984
1985	kill_temp_temp(pc);
1986	return TRUE;
1987}
1988
1989static void
1990prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
1991{
1992	struct nv50_reg *reg = NULL;
1993	const struct tgsi_full_src_register *src;
1994	const struct tgsi_dst_register *dst;
1995	unsigned i, c, k, mask;
1996
1997	dst = &insn->FullDstRegisters[0].DstRegister;
1998	mask = dst->WriteMask;
1999
2000	if (dst->File == TGSI_FILE_TEMPORARY)
2001		reg = pc->temp;
2002	else
2003	if (dst->File == TGSI_FILE_OUTPUT)
2004		reg = pc->result;
2005
2006	if (reg) {
2007		for (c = 0; c < 4; c++) {
2008			if (!(mask & (1 << c)))
2009				continue;
2010			reg[dst->Index * 4 + c].acc = pc->insn_nr;
2011		}
2012	}
2013
2014	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2015		src = &insn->FullSrcRegisters[i];
2016
2017		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
2018			reg = pc->temp;
2019		else
2020		if (src->SrcRegister.File == TGSI_FILE_INPUT)
2021			reg = pc->attr;
2022		else
2023			continue;
2024
2025		mask = nv50_tgsi_src_mask(insn, i);
2026
2027		for (c = 0; c < 4; c++) {
2028			if (!(mask & (1 << c)))
2029				continue;
2030			k = tgsi_util_get_full_src_register_extswizzle(src, c);
2031
2032			if (k > TGSI_EXTSWIZZLE_W)
2033				continue;
2034
2035			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
2036		}
2037	}
2038}
2039
2040/* Returns a bitmask indicating which dst components need to be
2041 * written to temporaries first to avoid 'corrupting' sources.
2042 *
2043 * m[i]    (out) indicates the component to write in the i-th position
2044 * rdep[c] (in)  bitmask of dst components that require dst[c] as a source
2045 */
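/* Illustrative trace (hand-worked, hypothetical instruction): for
 * "MOV TEMP[0].xy, TEMP[0].yx" we get rdep[0] = 0x2 and rdep[1] = 0x1,
 * i.e. x and y each depend on the other, so no reordering helps; the
 * function then returns unsafe = 0x1, meaning the component written first
 * (m[0]) has to go through a temporary (nv50_tgsi_insn() allocates
 * pc->r_dst[m[i]] for exactly these cases).
 */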
2046static unsigned
2047nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
2048{
2049	unsigned i, c, x, unsafe = 0;
2050
2051	for (c = 0; c < 4; c++)
2052		m[c] = c;
2053
2054	/* Swap as long as a dst component written earlier is depended on
2055	 * by one written later, but the next one isn't depended on by it.
2056	 */
2057	for (c = 0; c < 3; c++) {
2058		if (rdep[m[c + 1]] & (1 << m[c]))
2059			continue; /* if next one is depended on by us */
2060		for (i = c + 1; i < 4; i++)
2061			/* if we are depended on by a later one */
2062			if (rdep[m[c]] & (1 << m[i]))
2063				break;
2064		if (i == 4)
2065			continue;
2066		/* now, swap */
2067		x = m[c];
2068		m[c] = m[c + 1];
2069		m[c + 1] = x;
2070
2071		/* restart */
2072		c = 0;
2073	}
2074
2075	/* mark dependencies that could not be resolved by reordering */
2076	for (i = 0; i < 3; ++i)
2077		for (c = i + 1; c < 4; ++c)
2078			if (rdep[m[i]] & (1 << m[c]))
2079				unsafe |= (1 << i);
2080
2081	/* NOTE: $unsafe is with respect to order, not component */
2082	return unsafe;
2083}
2084
2085/* Select a suitable dst register for broadcasting scalar results,
2086 * or return NULL if we have to allocate an extra TEMP.
2087 *
2088 * If e.g. only 1 component is written, we may also emit the final
2089 * result to a write-only register.
2090 */
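/* Example (hand-worked): for "RSQ TEMP[0].xyz, TEMP[0].xxxx" the x
 * component is both read and written, so the mask passed in has bit 0
 * set; the function then returns TEMP[0].y as broadcast destination, and
 * the result is copied from there to x and z afterwards.
 */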
2091static struct nv50_reg *
2092tgsi_broadcast_dst(struct nv50_pc *pc,
2093		   const struct tgsi_full_dst_register *fd, unsigned mask)
2094{
2095	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
2096		int c = ffs(~mask & fd->DstRegister.WriteMask);
2097		if (c)
2098			return tgsi_dst(pc, c - 1, fd);
2099	} else {
2100		int c = ffs(fd->DstRegister.WriteMask) - 1;
2101		if ((1 << c) == fd->DstRegister.WriteMask)
2102			return tgsi_dst(pc, c, fd);
2103	}
2104
2105	return NULL;
2106}
2107
2108/* Scan source swizzles and return a bitmask indicating dst regs that
2109 * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
2110 */
2111static unsigned
2112nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
2113		       unsigned rdep[4])
2114{
2115	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
2116	const struct tgsi_full_src_register *fs;
2117	unsigned i, deqs = 0;
2118
2119	for (i = 0; i < 4; ++i)
2120		rdep[i] = 0;
2121
2122	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2123		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
2124		boolean neg_supp = negate_supported(insn, i);
2125
2126		fs = &insn->FullSrcRegisters[i];
2127		if (fs->SrcRegister.File != fd->DstRegister.File ||
2128		    fs->SrcRegister.Index != fd->DstRegister.Index)
2129			continue;
2130
2131		for (chn = 0; chn < 4; ++chn) {
2132			unsigned s, c;
2133
2134			if (!(mask & (1 << chn))) /* src is not read */
2135				continue;
2136			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
2137			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
2138
2139			if (c > TGSI_EXTSWIZZLE_W ||
2140			    !(fd->DstRegister.WriteMask & (1 << c)))
2141				continue;
2142
2143			/* no danger if src is copied to TEMP first */
2144			if ((s != TGSI_UTIL_SIGN_KEEP) &&
2145			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
2146				continue;
2147
2148			rdep[c] |= nv50_tgsi_dst_revdep(
2149				insn->Instruction.Opcode, i, chn);
2150			deqs |= (1 << c);
2151		}
2152	}
2153
2154	return deqs;
2155}
2156
2157static boolean
2158nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
2159{
2160	struct tgsi_full_instruction insn = tok->FullInstruction;
2161	const struct tgsi_full_dst_register *fd;
2162	unsigned i, deqs, rdep[4], m[4];
2163
2164	fd = &tok->FullInstruction.FullDstRegisters[0];
2165	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
2166
2167	if (is_scalar_op(insn.Instruction.Opcode)) {
2168		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
2169		if (!pc->r_brdc)
2170			pc->r_brdc = temp_temp(pc);
2171		return nv50_program_tx_insn(pc, &insn);
2172	}
2173	pc->r_brdc = NULL;
2174
2175	if (!deqs)
2176		return nv50_program_tx_insn(pc, &insn);
2177
2178	deqs = nv50_revdep_reorder(m, rdep);
2179
2180	for (i = 0; i < 4; ++i) {
2181		assert(pc->r_dst[m[i]] == NULL);
2182
2183		insn.FullDstRegisters[0].DstRegister.WriteMask =
2184			fd->DstRegister.WriteMask & (1 << m[i]);
2185
2186		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
2187			continue;
2188
2189		if (deqs & (1 << i))
2190			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
2191
2192		if (!nv50_program_tx_insn(pc, &insn))
2193			return FALSE;
2194	}
2195
2196	for (i = 0; i < 4; i++) {
2197		struct nv50_reg *reg = pc->r_dst[i];
2198		if (!reg)
2199			continue;
2200		pc->r_dst[i] = NULL;
2201
2202		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
2203			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
2204		else
2205			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
2206		free_temp(pc, reg);
2207	}
2208
2209	return TRUE;
2210}
2211
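/* Emit the load of an FP input register. For perspective interpolation we
 * lazily set up the interpolation divisor (iv_p, or iv_c for centroid);
 * emit_flop(pc, 0, ...) presumably takes the reciprocal here, giving 1/w
 * from the w interpolant provided after the position inputs.
 */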
2212static void
2213load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
2214{
2215	struct nv50_reg *iv, **ppiv;
2216	unsigned mode = pc->interp_mode[reg->index];
2217
2218	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
2219	iv = *ppiv;
2220
2221	if ((mode & INTERP_PERSPECTIVE) && !iv) {
2222		iv = *ppiv = alloc_temp(pc, NULL);
2223		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
2224
2225		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2226		emit_flop(pc, 0, iv, iv);
2227
2228		/* XXX: when loading interpolants dynamically, move these
2229		 * to the program head, or make sure they can't be skipped.
2230		 */
2231	}
2232
2233	emit_interp(pc, reg, iv, mode);
2234}
2235
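/* First pass over the TGSI tokens: record immediates, evaluate
 * declarations (interpolation modes, special output semantics) and inspect
 * instructions for register usage. Afterwards, assign hw slots: VP
 * attributes/results get consecutive ids, FP inputs get interpolant ids
 * (non-flat ones first) and FP results get their rhw indices.
 */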
2236static boolean
2237nv50_program_tx_prep(struct nv50_pc *pc)
2238{
2239	struct tgsi_parse_context tp;
2240	struct nv50_program *p = pc->p;
2241	boolean ret = FALSE;
2242	unsigned i, c, flat_nr = 0;
2243
2244	tgsi_parse_init(&tp, pc->p->pipe.tokens);
2245	while (!tgsi_parse_end_of_tokens(&tp)) {
2246		const union tgsi_full_token *tok = &tp.FullToken;
2247
2248		tgsi_parse_token(&tp);
2249		switch (tok->Token.Type) {
2250		case TGSI_TOKEN_TYPE_IMMEDIATE:
2251		{
2252			const struct tgsi_full_immediate *imm =
2253				&tp.FullToken.FullImmediate;
2254
2255			ctor_immd(pc, imm->u[0].Float,
2256				      imm->u[1].Float,
2257				      imm->u[2].Float,
2258				      imm->u[3].Float);
2259		}
2260			break;
2261		case TGSI_TOKEN_TYPE_DECLARATION:
2262		{
2263			const struct tgsi_full_declaration *d;
2264			unsigned si, last, first, mode;
2265
2266			d = &tp.FullToken.FullDeclaration;
2267			first = d->DeclarationRange.First;
2268			last = d->DeclarationRange.Last;
2269
2270			switch (d->Declaration.File) {
2271			case TGSI_FILE_TEMPORARY:
2272				break;
2273			case TGSI_FILE_OUTPUT:
2274				if (!d->Declaration.Semantic ||
2275				    p->type == PIPE_SHADER_FRAGMENT)
2276					break;
2277
2278				si = d->Semantic.SemanticIndex;
2279				switch (d->Semantic.SemanticName) {
2280				case TGSI_SEMANTIC_BCOLOR:
2281					p->cfg.two_side[si].hw = first;
2282					if (p->cfg.io_nr > first)
2283						p->cfg.io_nr = first;
2284					break;
2285				case TGSI_SEMANTIC_PSIZE:
2286					p->cfg.psiz = first;
2287					if (p->cfg.io_nr > first)
2288						p->cfg.io_nr = first;
2289					break;
2290					/*
2291				case TGSI_SEMANTIC_CLIP_DISTANCE:
2292					p->cfg.clpd = MIN2(p->cfg.clpd, first);
2293					break;
2294					*/
2295				default:
2296					break;
2297				}
2298				break;
2299			case TGSI_FILE_INPUT:
2300			{
2301				if (p->type != PIPE_SHADER_FRAGMENT)
2302					break;
2303
2304				switch (d->Declaration.Interpolate) {
2305				case TGSI_INTERPOLATE_CONSTANT:
2306					mode = INTERP_FLAT;
2307					flat_nr++;
2308					break;
2309				case TGSI_INTERPOLATE_PERSPECTIVE:
2310					mode = INTERP_PERSPECTIVE;
2311					p->cfg.regs[1] |= 0x08 << 24;
2312					break;
2313				default:
2314					mode = INTERP_LINEAR;
2315					break;
2316				}
2317				if (d->Declaration.Centroid)
2318					mode |= INTERP_CENTROID;
2319
2320				assert(last < 32);
2321				for (i = first; i <= last; i++)
2322					pc->interp_mode[i] = mode;
2323			}
2324				break;
2325			case TGSI_FILE_CONSTANT:
2326				break;
2327			case TGSI_FILE_SAMPLER:
2328				break;
2329			default:
2330				NOUVEAU_ERR("bad decl file %d\n",
2331					    d->Declaration.File);
2332				goto out_err;
2333			}
2334		}
2335			break;
2336		case TGSI_TOKEN_TYPE_INSTRUCTION:
2337			pc->insn_nr++;
2338			prep_inspect_insn(pc, &tok->FullInstruction);
2339			break;
2340		default:
2341			break;
2342		}
2343	}
2344
2345	if (p->type == PIPE_SHADER_VERTEX) {
2346		int rid = 0;
2347
2348		for (i = 0; i < pc->attr_nr * 4; ++i) {
2349			if (pc->attr[i].acc) {
2350				pc->attr[i].hw = rid++;
2351				p->cfg.attr[i / 32] |= 1 << (i % 32);
2352			}
2353		}
2354
2355		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
2356			p->cfg.io[i].hw = rid;
2357			p->cfg.io[i].id_vp = i;
2358
2359			for (c = 0; c < 4; ++c) {
2360				int n = i * 4 + c;
2361				if (!pc->result[n].acc)
2362					continue;
2363				pc->result[n].hw = rid++;
2364				p->cfg.io[i].mask |= 1 << c;
2365			}
2366		}
2367
2368		for (c = 0; c < 2; ++c)
2369			if (p->cfg.two_side[c].hw < 0x40)
2370				p->cfg.two_side[c] = p->cfg.io[
2371					p->cfg.two_side[c].hw];
2372
2373		if (p->cfg.psiz < 0x40)
2374			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
2375	} else
2376	if (p->type == PIPE_SHADER_FRAGMENT) {
2377		int rid, aid;
2378		unsigned n = 0, m = pc->attr_nr - flat_nr;
2379
2380		int base = (TGSI_SEMANTIC_POSITION ==
2381			    p->info.input_semantic_name[0]) ? 0 : 1;
2382
2383		/* non-flat interpolants have to be mapped to
2384		 * the lower hardware IDs, so sort them:
2385		 */
2386		for (i = 0; i < pc->attr_nr; i++) {
2387			if (pc->interp_mode[i] == INTERP_FLAT) {
2388				p->cfg.io[m].id_vp = i + base;
2389				p->cfg.io[m++].id_fp = i;
2390			} else {
2391				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
2392					p->cfg.io[n].linear = TRUE;
2393				p->cfg.io[n].id_vp = i + base;
2394				p->cfg.io[n++].id_fp = i;
2395			}
2396		}
2397
2398		if (!base) /* set w-coordinate mask from perspective interp */
2399			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
2400
2401		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
2402			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
2403
2404		for (n = 0; n < pc->attr_nr; ++n) {
2405			p->cfg.io[n].hw = rid = aid;
2406			i = p->cfg.io[n].id_fp;
2407
2408			for (c = 0; c < 4; ++c) {
2409				if (!pc->attr[i * 4 + c].acc)
2410					continue;
2411				pc->attr[i * 4 + c].rhw = rid++;
2412				p->cfg.io[n].mask |= 1 << c;
2413
2414				load_interpolant(pc, &pc->attr[i * 4 + c]);
2415			}
2416			aid += popcnt4(p->cfg.io[n].mask);
2417		}
2418
2419		if (!base)
2420			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
2421
2422		m = popcnt4(p->cfg.regs[1] >> 24);
2423
2424		/* set count of non-position inputs and of non-flat
2425		 * non-position inputs for FP_INTERPOLANT_CTRL
2426		 */
2427		p->cfg.regs[1] |= aid - m;
2428
2429		if (flat_nr) {
2430			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
2431			p->cfg.regs[1] |= (i - m) << 16;
2432		} else
2433			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
2434
2435		/* mark color semantic for light-twoside */
2436		n = 0x40;
2437		for (i = 0; i < pc->attr_nr; i++) {
2438			ubyte si, sn;
2439
2440			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
2441			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
2442
2443			if (sn == TGSI_SEMANTIC_COLOR) {
2444				p->cfg.two_side[si] = p->cfg.io[i];
2445
2446				/* increase colour count */
2447				p->cfg.regs[0] += popcnt4(
2448					p->cfg.two_side[si].mask) << 16;
2449
2450				n = MIN2(n, p->cfg.io[i].hw - m);
2451			}
2452		}
2453		if (n < 0x40)
2454			p->cfg.regs[0] += n;
2455
2456		/* Initialize FP results:
2457		 * FragDepth is always the first TGSI result, but the last hw output
2458		 */
2459		i = p->info.writes_z ? 4 : 0;
2460		for (rid = 0; i < pc->result_nr * 4; i++)
2461			pc->result[i].rhw = rid++;
2462		if (p->info.writes_z)
2463			pc->result[2].rhw = rid;
2464
2465		p->cfg.high_result = rid;
2466	}
2467
2468	if (pc->immd_nr) {
2469		int rid = 0;
2470
2471		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
2472		if (!pc->immd)
2473			goto out_err;
2474
2475		for (i = 0; i < pc->immd_nr; i++) {
2476			for (c = 0; c < 4; c++, rid++)
2477				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
2478		}
2479	}
2480
2481	ret = TRUE;
2482out_err:
2483	if (pc->iv_p)
2484		free_temp(pc, pc->iv_p);
2485	if (pc->iv_c)
2486		free_temp(pc, pc->iv_c);
2487
2488	tgsi_parse_free(&tp);
2489	return ret;
2490}
2491
2492static void
2493free_nv50_pc(struct nv50_pc *pc)
2494{
2495	if (pc->immd)
2496		FREE(pc->immd);
2497	if (pc->param)
2498		FREE(pc->param);
2499	if (pc->result)
2500		FREE(pc->result);
2501	if (pc->attr)
2502		FREE(pc->attr);
2503	if (pc->temp)
2504		FREE(pc->temp);
2505
2506	FREE(pc);
2507}
2508
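/* Set up the translation context: allocate nv50_reg arrays for each TGSI
 * file based on the shader info and initialize per-type defaults. 0x40
 * apparently serves as an "unassigned" sentinel for psiz, clpd and the
 * two_side hw indices; FP inputs and results live in temporaries (P_TEMP).
 */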
2509static boolean
2510ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
2511{
2512	int i, c;
2513	unsigned rtype[2] = { P_ATTR, P_RESULT };
2514
2515	pc->p = p;
2516	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
2517	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
2518	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
2519	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
2520
2521	p->cfg.high_temp = 4;
2522
2523	p->cfg.two_side[0].hw = 0x40;
2524	p->cfg.two_side[1].hw = 0x40;
2525
2526	switch (p->type) {
2527	case PIPE_SHADER_VERTEX:
2528		p->cfg.psiz = 0x40;
2529		p->cfg.clpd = 0x40;
2530		p->cfg.io_nr = pc->result_nr;
2531		break;
2532	case PIPE_SHADER_FRAGMENT:
2533		rtype[0] = rtype[1] = P_TEMP;
2534
2535		p->cfg.regs[0] = 0x01000004;
2536		p->cfg.io_nr = pc->attr_nr;
2537
2538		if (p->info.writes_z) {
2539			p->cfg.regs[2] |= 0x00000100;
2540			p->cfg.regs[3] |= 0x00000011;
2541		}
2542		if (p->info.uses_kill)
2543			p->cfg.regs[2] |= 0x00100000;
2544		break;
2545	}
2546
2547	if (pc->temp_nr) {
2548		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
2549		if (!pc->temp)
2550			return FALSE;
2551
2552		for (i = 0; i < pc->temp_nr * 4; ++i)
2553			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
2554	}
2555
2556	if (pc->attr_nr) {
2557		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
2558		if (!pc->attr)
2559			return FALSE;
2560
2561		for (i = 0; i < pc->attr_nr * 4; ++i)
2562			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
2563	}
2564
2565	if (pc->result_nr) {
2566		unsigned nr = pc->result_nr * 4;
2567
2568		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
2569		if (!pc->result)
2570			return FALSE;
2571
2572		for (i = 0; i < nr; ++i)
2573			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
2574	}
2575
2576	if (pc->param_nr) {
2577		int rid = 0;
2578
2579		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
2580		if (!pc->param)
2581			return FALSE;
2582
2583		for (i = 0; i < pc->param_nr; ++i)
2584			for (c = 0; c < 4; ++c, ++rid)
2585				ctor_reg(&pc->param[rid], P_CONST, i, rid);
2586	}
2587
2588	return TRUE;
2589}
2590
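/* FP results are written to the hw regs recorded in 'hw' during
 * translation; copy them into the register indices ('rhw') expected for
 * the colour/depth outputs wherever the two differ.
 */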
2591static void
2592nv50_fp_move_results(struct nv50_pc *pc)
2593{
2594	struct nv50_reg reg;
2595	unsigned i;
2596
2597	ctor_reg(&reg, P_TEMP, -1, -1);
2598
2599	for (i = 0; i < pc->result_nr * 4; ++i) {
2600		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
2601			continue;
2602		if (pc->result[i].rhw != pc->result[i].hw) {
2603			reg.hw = pc->result[i].rhw;
2604			emit_mov(pc, &reg, &pc->result[i]);
2605		}
2606	}
2607}
2608
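/* Final fixup pass over the emitted instructions: 32-bit (short)
 * instructions must come in pairs, so lone ones are converted to long
 * form, with recorded branch targets adjusted accordingly, and the last
 * instruction is made long so the program-end bit can be set on it.
 */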
2609static void
2610nv50_program_fixup_insns(struct nv50_pc *pc)
2611{
2612	struct nv50_program_exec *e, *prev = NULL, **bra_list;
2613	unsigned i, n, pos;
2614
2615	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
2616
2617	/* Collect branch instructions; we need to adjust their offsets
2618	 * when converting 32-bit instructions to 64-bit ones.
2619	 */
2620	for (n = 0, e = pc->p->exec_head; e; e = e->next)
2621		if (e->param.index >= 0 && !e->param.mask)
2622			bra_list[n++] = e;
2623
2624	/* Make sure we don't have any unpaired 32-bit instructions. */
2625	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
2626		pos += is_long(e) ? 2 : 1;
2627
2628		if ((pos & 1) && (!e->next || is_long(e->next))) {
2629			for (i = 0; i < n; ++i)
2630				if (bra_list[i]->param.index >= pos)
2631					bra_list[i]->param.index += 1;
2632			convert_to_long(pc, e);
2633			++pos;
2634		}
2635		if (e->next)
2636			prev = e;
2637	}
2638
2639	assert(!is_immd(pc->p->exec_head));
2640	assert(!is_immd(pc->p->exec_tail));
2641
2642	/* last instruction must be long so it can have the end bit set */
2643	if (!is_long(pc->p->exec_tail)) {
2644		convert_to_long(pc, pc->p->exec_tail);
2645		if (prev)
2646			convert_to_long(pc, prev);
2647	}
2648	assert(!(pc->p->exec_tail->inst[1] & 2));
2649	/* set the end-bit */
2650	pc->p->exec_tail->inst[1] |= 1;
2651
2652	FREE(bra_list);
2653}
2654
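/* Translate the whole program: run the prep pass, translate each
 * instruction token (allow32 disables half insns/immediates near the
 * start and end of the program), then run the instruction fixup pass.
 */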
2655static boolean
2656nv50_program_tx(struct nv50_program *p)
2657{
2658	struct tgsi_parse_context parse;
2659	struct nv50_pc *pc;
2660	boolean ret;
2661
2662	pc = CALLOC_STRUCT(nv50_pc);
2663	if (!pc)
2664		return FALSE;
2665
2666	ret = ctor_nv50_pc(pc, p);
2667	if (ret == FALSE)
2668		goto out_cleanup;
2669
2670	ret = nv50_program_tx_prep(pc);
2671	if (ret == FALSE)
2672		goto out_cleanup;
2673
2674	tgsi_parse_init(&parse, pc->p->pipe.tokens);
2675	while (!tgsi_parse_end_of_tokens(&parse)) {
2676		const union tgsi_full_token *tok = &parse.FullToken;
2677
2678		/* don't allow half insn/immd on first and last instruction */
2679		pc->allow32 = TRUE;
2680		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2681			pc->allow32 = FALSE;
2682
2683		tgsi_parse_token(&parse);
2684
2685		switch (tok->Token.Type) {
2686		case TGSI_TOKEN_TYPE_INSTRUCTION:
2687			++pc->insn_cur;
2688			ret = nv50_tgsi_insn(pc, tok);
2689			if (ret == FALSE)
2690				goto out_err;
2691			break;
2692		default:
2693			break;
2694		}
2695	}
2696
2697	if (pc->p->type == PIPE_SHADER_FRAGMENT)
2698		nv50_fp_move_results(pc);
2699
2700	nv50_program_fixup_insns(pc);
2701
2702	p->param_nr = pc->param_nr * 4;
2703	p->immd_nr = pc->immd_nr * 4;
2704	p->immd = pc->immd_buf;
2705
2706out_err:
2707	tgsi_parse_free(&parse);
2708
2709out_cleanup:
2710	free_nv50_pc(pc);
2711	return ret;
2712}
2713
2714static void
2715nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2716{
2717	if (nv50_program_tx(p) == FALSE)
2718		assert(0);
2719	p->translated = TRUE;
2720}
2721
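/* Upload 'count' floats into constant buffer 'cbuf' starting at word
 * 'start'. The CB_DATA method appears to take at most 2047 values per
 * packet, hence the loop.
 */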
2722static void
2723nv50_program_upload_data(struct nv50_context *nv50, float *map,
2724			unsigned start, unsigned count, unsigned cbuf)
2725{
2726	struct nouveau_channel *chan = nv50->screen->base.channel;
2727	struct nouveau_grobj *tesla = nv50->screen->tesla;
2728
2729	while (count) {
2730		unsigned nr = count > 2047 ? 2047 : count;
2731
2732		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2733		OUT_RING  (chan, (cbuf << 0) | (start << 8));
2734		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2735		OUT_RINGp (chan, map, nr);
2736
2737		map += nr;
2738		start += nr;
2739		count -= nr;
2740	}
2741}
2742
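/* Make sure the program's immediates have space in the immediate heap
 * (evicting other programs' allocations if necessary) and upload them,
 * then upload the user constants (params) into the per-type constant
 * buffer.
 */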
2743static void
2744nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2745{
2746	struct pipe_screen *pscreen = nv50->pipe.screen;
2747
2748	if (!p->data[0] && p->immd_nr) {
2749		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2750
2751		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
2752			while (heap->next && heap->size < p->immd_nr) {
2753				struct nv50_program *evict = heap->next->priv;
2754				nouveau_resource_free(&evict->data[0]);
2755			}
2756
2757			if (nouveau_resource_alloc(heap, p->immd_nr, p,
2758						   &p->data[0]))
2759				assert(0);
2760		}
2761
2762		/* immediates only need to be uploaded again after they have been evicted (freed) */
2763		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2764					 p->immd_nr, NV50_CB_PMISC);
2765	}
2766
2767	assert(p->param_nr <= 128);
2768
2769	if (p->param_nr) {
2770		unsigned cb;
2771		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
2772					     PIPE_BUFFER_USAGE_CPU_READ);
2773
2774		if (p->type == PIPE_SHADER_VERTEX)
2775			cb = NV50_CB_PVP;
2776		else
2777			cb = NV50_CB_PFP;
2778
2779		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
2780		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
2781	}
2782}
2783
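/* Patch constant buffer offsets and branch targets into the encoded
 * instructions, then upload the program code. The upload seems to go
 * through the NV50_CB_PUPLOAD constant buffer, which is first pointed at
 * the program bo, 2047 words per packet.
 */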
2784static void
2785nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2786{
2787	struct nouveau_channel *chan = nv50->screen->base.channel;
2788	struct nouveau_grobj *tesla = nv50->screen->tesla;
2789	struct nv50_program_exec *e;
2790	struct nouveau_stateobj *so;
2791	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2792	unsigned start, count, *up, *ptr;
2793	boolean upload = FALSE;
2794
2795	if (!p->bo) {
2796		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
2797			       p->exec_size * 4, &p->bo);
2798		upload = TRUE;
2799	}
2800
2801	if (p->data[0] && p->data[0]->start != p->data_start[0])
2802		upload = TRUE;
2803
2804	if (!upload)
2805		return;
2806
2807	for (e = p->exec_head; e; e = e->next) {
2808		unsigned ei, ci, bs;
2809
2810		if (e->param.index < 0)
2811			continue;
2812
2813		if (e->param.mask == 0) {
2814			assert(!(e->param.index & 1));
2815			/* branch targets seem to be in 8 byte steps */
2816			ei = (e->param.index >> 1) + 0 /* START_ID */;
2817
2818			e->inst[0] &= 0xf0000fff;
2819			e->inst[0] |= ei << 12;
2820			continue;
2821		}
2822
2823		bs = (e->inst[1] >> 22) & 0x07;
2824		assert(bs < 2);
2825		ei = e->param.shift >> 5;
2826		ci = e->param.index;
2827		if (bs == 0)
2828			ci += p->data[bs]->start;
2829
2830		e->inst[ei] &= ~e->param.mask;
2831		e->inst[ei] |= (ci << e->param.shift);
2832	}
2833
2834	if (p->data[0])
2835		p->data_start[0] = p->data[0]->start;
2836
2837#ifdef NV50_PROGRAM_DUMP
2838	NOUVEAU_ERR("-------\n");
2839	for (e = p->exec_head; e; e = e->next) {
2840		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2841		if (is_long(e))
2842			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2843	}
2844#endif
2845
2846	up = ptr = MALLOC(p->exec_size * 4);
2847	for (e = p->exec_head; e; e = e->next) {
2848		*(ptr++) = e->inst[0];
2849		if (is_long(e))
2850			*(ptr++) = e->inst[1];
2851	}
2852
2853	so = so_new(4,2);
2854	so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
2855	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2856	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2857	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2858
2859	start = 0; count = p->exec_size;
2860	while (count) {
2861		struct nouveau_channel *chan = nv50->screen->base.channel;
2862		unsigned nr;
2863
2864		so_emit(chan, so);
2865
2866		nr = MIN2(count, 2047);
2867		nr = MIN2(chan->pushbuf->remaining, nr);
2868		if (chan->pushbuf->remaining < (nr + 3)) {
2869			FIRE_RING(chan);
2870			continue;
2871		}
2872
2873		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2874		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
2875		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2876		OUT_RINGp (chan, up + start, nr);
2877
2878		start += nr;
2879		count -= nr;
2880	}
2881
2882	FREE(up);
2883	so_ref(NULL, &so);
2884}
2885
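/* Validate the bound vertex program: translate it if necessary, upload
 * its constants/immediates and code, and build the VP stateobj.
 */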
2886void
2887nv50_vertprog_validate(struct nv50_context *nv50)
2888{
2889	struct nouveau_grobj *tesla = nv50->screen->tesla;
2890	struct nv50_program *p = nv50->vertprog;
2891	struct nouveau_stateobj *so;
2892
2893	if (!p->translated) {
2894		nv50_program_validate(nv50, p);
2895		if (!p->translated)
2896			assert(0);
2897	}
2898
2899	nv50_program_validate_data(nv50, p);
2900	nv50_program_validate_code(nv50, p);
2901
2902	so = so_new(13, 2);
2903	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2904	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2905		      NOUVEAU_BO_HIGH, 0, 0);
2906	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2907		      NOUVEAU_BO_LOW, 0, 0);
2908	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
2909	so_data  (so, p->cfg.attr[0]);
2910	so_data  (so, p->cfg.attr[1]);
2911	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
2912	so_data  (so, p->cfg.high_result);
2913	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
2914	so_data  (so, p->cfg.high_result); //8);
2915	so_data  (so, p->cfg.high_temp);
2916	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
2917	so_data  (so, 0); /* program start offset */
2918	so_ref(so, &nv50->state.vertprog);
2919	so_ref(NULL, &so);
2920}
2921
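/* Same as above for the bound fragment program: translate, upload data
 * and code, and build the FP stateobj.
 */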
2922void
2923nv50_fragprog_validate(struct nv50_context *nv50)
2924{
2925	struct nouveau_grobj *tesla = nv50->screen->tesla;
2926	struct nv50_program *p = nv50->fragprog;
2927	struct nouveau_stateobj *so;
2928
2929	if (!p->translated) {
2930		nv50_program_validate(nv50, p);
2931		if (!p->translated)
2932			assert(0);
2933	}
2934
2935	nv50_program_validate_data(nv50, p);
2936	nv50_program_validate_code(nv50, p);
2937
2938	so = so_new(64, 2);
2939	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2940	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2941		      NOUVEAU_BO_HIGH, 0, 0);
2942	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2943		      NOUVEAU_BO_LOW, 0, 0);
2944	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
2945	so_data  (so, p->cfg.high_temp);
2946	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
2947	so_data  (so, p->cfg.high_result);
2948	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
2949	so_data  (so, p->cfg.regs[2]);
2950	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
2951	so_data  (so, p->cfg.regs[3]);
2952	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
2953	so_data  (so, 0); /* program start offset */
2954	so_ref(so, &nv50->state.fragprog);
2955	so_ref(NULL, &so);
2956}
2957
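/* Build the POINT_COORD_REPLACE_MAP words for point sprites: each hw FP
 * input slot gets a 4-bit entry; for GENERIC inputs whose
 * sprite_coord_mode is enabled, the entry appears to select the point
 * coordinate component (c + 1) that replaces the interpolated value.
 */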
2958static void
2959nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
2960{
2961	struct nv50_program *fp = nv50->fragprog;
2962	struct nv50_program *vp = nv50->vertprog;
2963	unsigned i, c, m = base;
2964
2965	/* XXX: This can't work correctly in all cases yet; we either
2966	 * have to create TGSI_SEMANTIC_PNTC, or sprite_coord_mode has
2967	 * to be per FP input instead of per VP output.
2968	 */
2969	memset(pntc, 0, 8 * sizeof(uint32_t));
2970
2971	for (i = 0; i < fp->cfg.io_nr; i++) {
2972		uint8_t sn, si;
2973		uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
2974		unsigned n = popcnt4(fp->cfg.io[i].mask);
2975
2976		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
2977			m += n;
2978			continue;
2979		}
2980
2981		sn = vp->info.input_semantic_name[j];
2982		si = vp->info.input_semantic_index[j];
2983
2984		if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) {
2985			ubyte mode =
2986				nv50->rasterizer->pipe.sprite_coord_mode[si];
2987
2988			if (mode == PIPE_SPRITE_COORD_NONE) {
2989				m += n;
2990				continue;
2991			}
2992		}
2993
2994		/* this is either PointCoord or replaced by sprite coords */
2995		for (c = 0; c < 4; c++) {
2996			if (!(fp->cfg.io[i].mask & (1 << c)))
2997				continue;
2998			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
2999			++m;
3000		}
3001	}
3002}
3003
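/* Append the VP output slots feeding one FP input (sreg4) to the VP
 * result map, starting at byte 'mid'. Components the VP does not write
 * appear to get the constants 0x40/0x41 (presumably 0.0, and 1.0 for the
 * w component); the 'lin' bitfield marks linearly interpolated inputs.
 */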
3004static int
3005nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
3006	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
3007{
3008	int c;
3009	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
3010	uint8_t *map = (uint8_t *)p_map;
3011
3012	for (c = 0; c < 4; ++c) {
3013		if (mf & 1) {
3014			if (fpi->linear == TRUE)
3015				lin[mid / 32] |= 1 << (mid % 32);
3016			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
3017		}
3018
3019		oid += mv & 1;
3020		mf >>= 1;
3021		mv >>= 1;
3022	}
3023
3024	return mid;
3025}
3026
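/* Build the state that links VP outputs to FP inputs: the VP result map
 * (HPOS, clip distances, back colours, the remaining inputs and
 * optionally point size), the semantic map registers, the interpolant
 * control word, the linear-interpolation bits and, for point sprites,
 * the point coord replace map.
 */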
3027void
3028nv50_linkage_validate(struct nv50_context *nv50)
3029{
3030	struct nouveau_grobj *tesla = nv50->screen->tesla;
3031	struct nv50_program *vp = nv50->vertprog;
3032	struct nv50_program *fp = nv50->fragprog;
3033	struct nouveau_stateobj *so;
3034	struct nv50_sreg4 dummy, *vpo;
3035	int i, n, c, m = 0;
3036	uint32_t map[16], lin[4], reg[5], pcrd[8];
3037
3038	memset(map, 0, sizeof(map));
3039	memset(lin, 0, sizeof(lin));
3040
3041	reg[1] = 0x00000004; /* low and high clip distance map ids */
3042	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
3043	reg[3] = 0x00000000; /* point size map id & enable */
3044	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
3045	reg[4] = fp->cfg.regs[1]; /* interpolant info */
3046
3047	dummy.linear = FALSE;
3048	dummy.mask = 0xf; /* map all components of HPOS */
3049	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
3050
3051	dummy.mask = 0x0;
3052
3053	if (vp->cfg.clpd < 0x40) {
3054		for (c = 0; c < vp->cfg.clpd_nr; ++c)
3055			map[m++] = vp->cfg.clpd + c;
3056		reg[1] = (m << 8);
3057	}
3058
3059	reg[0] |= m << 8; /* adjust BFC0 id */
3060
3061	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
3062	if (nv50->rasterizer->pipe.light_twoside) {
3063		vpo = &vp->cfg.two_side[0];
3064
3065		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
3066		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
3067	}
3068
3069	reg[0] += m - 4; /* adjust FFC0 id */
3070	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
3071
3072	i = 0;
3073	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
3074		i = 1;
3075	for (; i < fp->cfg.io_nr; i++) {
3076		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
3077		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
3078
3079		n = fp->cfg.io[i].id_vp;
3080		if (n >= vp->cfg.io_nr ||
3081		    vp->info.output_semantic_name[n] != sn ||
3082		    vp->info.output_semantic_index[n] != si)
3083			vpo = &dummy;
3084		else
3085			vpo = &vp->cfg.io[n];
3086
3087		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
3088	}
3089
3090	if (nv50->rasterizer->pipe.point_size_per_vertex) {
3091		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
3092		reg[3] = (m++ << 4) | 1;
3093	}
3094
3095	/* now fill the stateobj */
3096	so = so_new(64, 0);
3097
3098	n = (m + 3) / 4;
3099	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
3100	so_data  (so, m);
3101	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
3102	so_datap (so, map, n);
3103
3104	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
3105	so_datap (so, reg, 4);
3106
3107	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
3108	so_data  (so, reg[4]);
3109
3110	so_method(so, tesla, 0x1540, 4);
3111	so_datap (so, lin, 4);
3112
3113	if (nv50->rasterizer->pipe.point_sprite) {
3114		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
3115
3116		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
3117		so_datap (so, pcrd, 8);
3118	}
3119
3120	so_ref(so, &nv50->state.programs);
3121	so_ref(NULL, &so);
3122}
3123
3124void
3125nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
3126{
3127	while (p->exec_head) {
3128		struct nv50_program_exec *e = p->exec_head;
3129
3130		p->exec_head = e->next;
3131		FREE(e);
3132	}
3133	p->exec_tail = NULL;
3134	p->exec_size = 0;
3135
3136	nouveau_bo_ref(NULL, &p->bo);
3137
3138	nouveau_resource_free(&p->data[0]);
3139
3140	p->translated = 0;
3141}
3142