nv50_program.c revision 2eef2017acbbb617c559555648c7745141f3aedb
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
34#define NV50_SU_MAX_TEMP 64
35//#define NV50_PROGRAM_DUMP
36
/* ARL - gallium fails on progs/vp/arl.txt
 *
 * MSB - Like MAD, but MUL+SUB
 * 	- Drop it as a separate op; introduce a way to negate args for ops
 * 	  that support it.
 *
 * Look into inlining IMMD for ops other than MOV (make it general?)
 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
 *
 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
 * case, if the emit_src() causes the inst to suddenly become long.
 *
 * Verify half-insns work where expected - and force disable them where they
 * don't work - MUL has it forcibly disabled atm as it fixes POW..
 *
 * WARNING: watch dst==src vectors, can overwrite components that are needed.
 * 	ie. SUB R0, R0.yzxw, R0
 *
 * Things to check with renouveau:
 * 	FP attr/result assignment - how?
 * 		attrib
 * 			- 0x16bc maps vp output onto fp hpos
 * 			- 0x16c0 maps vp output onto fp col0
 * 		result
 * 			- colr always 0-3
 * 			- depr always 4
 * 0x16bc->0x16e8 --> some binding between vp/fp regs
 * 0x16b8 --> VP output count
 *
 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
 * 	      "MOV rcol.x, fcol.y" = 0x00000004
 * 0x19a8 --> as above but 0x00000100 and 0x00000000
 * 	- 0x00100000 used when KIL used
 * 0x196c --> as above but 0x00000011 and 0x00000000
 *
 * 0x1988 --> 0xXXNNNNNN
 * 	- XX == FP high something
 */
/* Descriptor for one scalar register handled by the code generator. */
struct nv50_reg {
	/* register file the descriptor refers to */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;

	/* index passed to ctor_reg(); helper-allocated temporaries use -1 */
	int index;

	/* hw slot; P_TEMP registers hold -1 until alloc_reg() picks one */
	int hw;

	/* non-zero when the operand should be read negated */
	int neg;

	/* result hw for FP outputs, or interpolant index; -1 when unset */
	int rhw;

	/* instruction where this reg is last read (first insn == 1) */
	int acc;
};
92
93/* arbitrary limits */
94#define MAX_IF_DEPTH 4
95#define MAX_LOOP_DEPTH 4
96
97struct nv50_pc {
98	struct nv50_program *p;
99
100	/* hw resources */
101	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
102
103	/* tgsi resources */
104	struct nv50_reg *temp;
105	int temp_nr;
106	struct nv50_reg *attr;
107	int attr_nr;
108	struct nv50_reg *result;
109	int result_nr;
110	struct nv50_reg *param;
111	int param_nr;
112	struct nv50_reg *immd;
113	float *immd_buf;
114	int immd_nr;
115
116	struct nv50_reg *temp_temp[16];
117	unsigned temp_temp_nr;
118
119	/* broadcast and destination replacement regs */
120	struct nv50_reg *r_brdc;
121	struct nv50_reg *r_dst[4];
122
123	unsigned interp_mode[32];
124	/* perspective interpolation registers */
125	struct nv50_reg *iv_p;
126	struct nv50_reg *iv_c;
127
128	struct nv50_program_exec *if_cond;
129	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
130	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
131	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
132	int if_lvl, loop_lvl;
133	unsigned loop_pos[MAX_LOOP_DEPTH];
134
135	/* current instruction and total number of insns */
136	unsigned insn_cur;
137	unsigned insn_nr;
138
139	boolean allow32;
140};
141
142static INLINE void
143ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
144{
145	reg->type = type;
146	reg->index = index;
147	reg->hw = hw;
148	reg->neg = 0;
149	reg->rhw = -1;
150	reg->acc = 0;
151}
152
/* Count the set bits in the low four bits of val; higher bits are ignored. */
static INLINE unsigned
popcnt4(uint32_t val)
{
	uint32_t nibble = val & 0xf;
	unsigned bits = 0;

	while (nibble) {
		bits += nibble & 1;
		nibble >>= 1;
	}
	return bits;
}
160
161static void
162alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
163{
164	int i = 0;
165
166	if (reg->type == P_RESULT) {
167		if (pc->p->cfg.high_result < (reg->hw + 1))
168			pc->p->cfg.high_result = reg->hw + 1;
169	}
170
171	if (reg->type != P_TEMP)
172		return;
173
174	if (reg->hw >= 0) {
175		/*XXX: do this here too to catch FP temp-as-attr usage..
176		 *     not clean, but works */
177		if (pc->p->cfg.high_temp < (reg->hw + 1))
178			pc->p->cfg.high_temp = reg->hw + 1;
179		return;
180	}
181
182	if (reg->rhw != -1) {
183		/* try to allocate temporary with index rhw first */
184		if (!(pc->r_temp[reg->rhw])) {
185			pc->r_temp[reg->rhw] = reg;
186			reg->hw = reg->rhw;
187			if (pc->p->cfg.high_temp < (reg->rhw + 1))
188				pc->p->cfg.high_temp = reg->rhw + 1;
189			return;
190		}
191		/* make sure we don't get things like $r0 needs to go
192		 * in $r1 and $r1 in $r0
193		 */
194		i = pc->result_nr * 4;
195	}
196
197	for (; i < NV50_SU_MAX_TEMP; i++) {
198		if (!(pc->r_temp[i])) {
199			pc->r_temp[i] = reg;
200			reg->hw = i;
201			if (pc->p->cfg.high_temp < (i + 1))
202				pc->p->cfg.high_temp = i + 1;
203			return;
204		}
205	}
206
207	assert(0);
208}
209
210/* XXX: For shaders that aren't executed linearly (e.g. shaders that
211 * contain loops), we need to assign all hw regs to TGSI TEMPs early,
212 * lest we risk temp_temps overwriting regs alloc'd "later".
213 */
214static struct nv50_reg *
215alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
216{
217	struct nv50_reg *r;
218	int i;
219
220	if (dst && dst->type == P_TEMP && dst->hw == -1)
221		return dst;
222
223	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
224		if (!pc->r_temp[i]) {
225			r = MALLOC_STRUCT(nv50_reg);
226			ctor_reg(r, P_TEMP, -1, i);
227			pc->r_temp[i] = r;
228			return r;
229		}
230	}
231
232	assert(0);
233	return NULL;
234}
235
236/* Assign the hw of the discarded temporary register src
237 * to the tgsi register dst and free src.
238 */
239static void
240assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
241{
242	assert(src->index == -1 && src->hw != -1);
243
244	if (dst->hw != -1)
245		pc->r_temp[dst->hw] = NULL;
246	pc->r_temp[src->hw] = dst;
247	dst->hw = src->hw;
248
249	FREE(src);
250}
251
252/* release the hardware resource held by r */
253static void
254release_hw(struct nv50_pc *pc, struct nv50_reg *r)
255{
256	assert(r->type == P_TEMP);
257	if (r->hw == -1)
258		return;
259
260	assert(pc->r_temp[r->hw] == r);
261	pc->r_temp[r->hw] = NULL;
262
263	r->acc = 0;
264	if (r->index == -1)
265		FREE(r);
266}
267
268static void
269free_temp(struct nv50_pc *pc, struct nv50_reg *r)
270{
271	if (r->index == -1) {
272		unsigned hw = r->hw;
273
274		FREE(pc->r_temp[hw]);
275		pc->r_temp[hw] = NULL;
276	}
277}
278
279static int
280alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
281{
282	int i;
283
284	if ((idx + 4) >= NV50_SU_MAX_TEMP)
285		return 1;
286
287	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
288	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
289		return alloc_temp4(pc, dst, idx + 4);
290
291	for (i = 0; i < 4; i++) {
292		dst[i] = MALLOC_STRUCT(nv50_reg);
293		ctor_reg(dst[i], P_TEMP, -1, idx + i);
294		pc->r_temp[idx + i] = dst[i];
295	}
296
297	return 0;
298}
299
/* Free the four temporaries in reg[0..3] via free_temp(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	unsigned c = 0;

	while (c < 4) {
		free_temp(pc, reg[c]);
		c++;
	}
}
308
309static struct nv50_reg *
310temp_temp(struct nv50_pc *pc)
311{
312	if (pc->temp_temp_nr >= 16)
313		assert(0);
314
315	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
316	return pc->temp_temp[pc->temp_temp_nr++];
317}
318
319static void
320kill_temp_temp(struct nv50_pc *pc)
321{
322	int i;
323
324	for (i = 0; i < pc->temp_temp_nr; i++)
325		free_temp(pc, pc->temp_temp[i]);
326	pc->temp_temp_nr = 0;
327}
328
329static int
330ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
331{
332	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
333			       (pc->immd_nr + 1) * 4 * sizeof(float));
334	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
335	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
336	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
337	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
338
339	return pc->immd_nr++;
340}
341
342static struct nv50_reg *
343alloc_immd(struct nv50_pc *pc, float f)
344{
345	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
346	unsigned hw;
347
348	for (hw = 0; hw < pc->immd_nr * 4; hw++)
349		if (pc->immd_buf[hw] == f)
350			break;
351
352	if (hw == pc->immd_nr * 4)
353		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
354
355	ctor_reg(r, P_IMMD, -1, hw);
356	return r;
357}
358
359static struct nv50_program_exec *
360exec(struct nv50_pc *pc)
361{
362	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
363
364	e->param.index = -1;
365	return e;
366}
367
368static void
369emit(struct nv50_pc *pc, struct nv50_program_exec *e)
370{
371	struct nv50_program *p = pc->p;
372
373	if (p->exec_tail)
374		p->exec_tail->next = e;
375	if (!p->exec_head)
376		p->exec_head = e;
377	p->exec_tail = e;
378	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
379}
380
381static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
382
383static boolean
384is_long(struct nv50_program_exec *e)
385{
386	if (e->inst[0] & 1)
387		return TRUE;
388	return FALSE;
389}
390
391static boolean
392is_immd(struct nv50_program_exec *e)
393{
394	if (is_long(e) && (e->inst[1] & 3) == 3)
395		return TRUE;
396	return FALSE;
397}
398
399static INLINE void
400set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
401	 struct nv50_program_exec *e)
402{
403	set_long(pc, e);
404	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
405	e->inst[1] |= (pred << 7) | (idx << 12);
406}
407
408static INLINE void
409set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
410	    struct nv50_program_exec *e)
411{
412	set_long(pc, e);
413	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
414	e->inst[1] |= (idx << 4) | (on << 6);
415}
416
417static INLINE void
418set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
419{
420	if (is_long(e))
421		return;
422
423	e->inst[0] |= 1;
424	set_pred(pc, 0xf, 0, e);
425	set_pred_wr(pc, 0, 0, e);
426}
427
428static INLINE void
429set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
430{
431	if (dst->type == P_RESULT) {
432		set_long(pc, e);
433		e->inst[1] |= 0x00000008;
434	}
435
436	alloc_reg(pc, dst);
437	e->inst[0] |= (dst->hw << 2);
438}
439
440static INLINE void
441set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
442{
443	float f = pc->immd_buf[imm->hw];
444	unsigned val = fui(imm->neg ? -f : f);
445
446	set_long(pc, e);
447	/*XXX: can't be predicated - bits overlap.. catch cases where both
448	 *     are required and avoid them. */
449	set_pred(pc, 0, 0, e);
450	set_pred_wr(pc, 0, 0, e);
451
452	e->inst[1] |= 0x00000002 | 0x00000001;
453	e->inst[0] |= (val & 0x3f) << 16;
454	e->inst[1] |= (val >> 6) << 2;
455}
456
457
458#define INTERP_LINEAR		0
459#define INTERP_FLAT			1
460#define INTERP_PERSPECTIVE	2
461#define INTERP_CENTROID		4
462
463/* interpolant index has been stored in dst->rhw */
464static void
465emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
466		unsigned mode)
467{
468	assert(dst->rhw != -1);
469	struct nv50_program_exec *e = exec(pc);
470
471	e->inst[0] |= 0x80000000;
472	set_dst(pc, dst, e);
473	e->inst[0] |= (dst->rhw << 16);
474
475	if (mode & INTERP_FLAT) {
476		e->inst[0] |= (1 << 8);
477	} else {
478		if (mode & INTERP_PERSPECTIVE) {
479			e->inst[0] |= (1 << 25);
480			alloc_reg(pc, iv);
481			e->inst[0] |= (iv->hw << 9);
482		}
483
484		if (mode & INTERP_CENTROID)
485			e->inst[0] |= (1 << 24);
486	}
487
488	emit(pc, e);
489}
490
491static void
492set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
493	 struct nv50_program_exec *e)
494{
495	set_long(pc, e);
496
497	e->param.index = src->hw;
498	e->param.shift = s;
499	e->param.mask = m << (s % 32);
500
501	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
502}
503
504static void
505emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
506{
507	struct nv50_program_exec *e = exec(pc);
508
509	e->inst[0] = 0x10000000;
510	if (!pc->allow32)
511		set_long(pc, e);
512
513	set_dst(pc, dst, e);
514
515	if (!is_long(e) && src->type == P_IMMD) {
516		set_immd(pc, src, e);
517		/*XXX: 32-bit, but steals part of "half" reg space - need to
518		 *     catch and handle this case if/when we do half-regs
519		 */
520	} else
521	if (src->type == P_IMMD || src->type == P_CONST) {
522		set_long(pc, e);
523		set_data(pc, src, 0x7f, 9, e);
524		e->inst[1] |= 0x20000000; /* src0 const? */
525	} else {
526		if (src->type == P_ATTR) {
527			set_long(pc, e);
528			e->inst[1] |= 0x00200000;
529		}
530
531		alloc_reg(pc, src);
532		e->inst[0] |= (src->hw << 9);
533	}
534
535	if (is_long(e) && !is_immd(e)) {
536		e->inst[1] |= 0x04000000; /* 32-bit */
537		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
538		if (!(e->inst[1] & 0x20000000))
539			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
540	} else
541		e->inst[0] |= 0x00008000;
542
543	emit(pc, e);
544}
545
546static INLINE void
547emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
548{
549	struct nv50_reg *imm = alloc_immd(pc, f);
550	emit_mov(pc, dst, imm);
551	FREE(imm);
552}
553
554static boolean
555check_swap_src_0_1(struct nv50_pc *pc,
556		   struct nv50_reg **s0, struct nv50_reg **s1)
557{
558	struct nv50_reg *src0 = *s0, *src1 = *s1;
559
560	if (src0->type == P_CONST) {
561		if (src1->type != P_CONST) {
562			*s0 = src1;
563			*s1 = src0;
564			return TRUE;
565		}
566	} else
567	if (src1->type == P_ATTR) {
568		if (src0->type != P_ATTR) {
569			*s0 = src1;
570			*s1 = src0;
571			return TRUE;
572		}
573	}
574
575	return FALSE;
576}
577
578static void
579set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
580		     struct nv50_program_exec *e)
581{
582	struct nv50_reg *temp;
583
584	if (src->type != P_TEMP) {
585		temp = temp_temp(pc);
586		emit_mov(pc, temp, src);
587		src = temp;
588	}
589
590	alloc_reg(pc, src);
591	e->inst[0] |= (src->hw << 9);
592}
593
594static void
595set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
596{
597	if (src->type == P_ATTR) {
598		set_long(pc, e);
599		e->inst[1] |= 0x00200000;
600	} else
601	if (src->type == P_CONST || src->type == P_IMMD) {
602		struct nv50_reg *temp = temp_temp(pc);
603
604		emit_mov(pc, temp, src);
605		src = temp;
606	}
607
608	alloc_reg(pc, src);
609	e->inst[0] |= (src->hw << 9);
610}
611
612static void
613set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
614{
615	if (src->type == P_ATTR) {
616		struct nv50_reg *temp = temp_temp(pc);
617
618		emit_mov(pc, temp, src);
619		src = temp;
620	} else
621	if (src->type == P_CONST || src->type == P_IMMD) {
622		assert(!(e->inst[0] & 0x00800000));
623		if (e->inst[0] & 0x01000000) {
624			struct nv50_reg *temp = temp_temp(pc);
625
626			emit_mov(pc, temp, src);
627			src = temp;
628		} else {
629			set_data(pc, src, 0x7f, 16, e);
630			e->inst[0] |= 0x00800000;
631		}
632	}
633
634	alloc_reg(pc, src);
635	e->inst[0] |= (src->hw << 16);
636}
637
638static void
639set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
640{
641	set_long(pc, e);
642
643	if (src->type == P_ATTR) {
644		struct nv50_reg *temp = temp_temp(pc);
645
646		emit_mov(pc, temp, src);
647		src = temp;
648	} else
649	if (src->type == P_CONST || src->type == P_IMMD) {
650		assert(!(e->inst[0] & 0x01000000));
651		if (e->inst[0] & 0x00800000) {
652			struct nv50_reg *temp = temp_temp(pc);
653
654			emit_mov(pc, temp, src);
655			src = temp;
656		} else {
657			set_data(pc, src, 0x7f, 32+14, e);
658			e->inst[0] |= 0x01000000;
659		}
660	}
661
662	alloc_reg(pc, src);
663	e->inst[1] |= (src->hw << 14);
664}
665
666static void
667emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
668	 struct nv50_reg *src1)
669{
670	struct nv50_program_exec *e = exec(pc);
671
672	e->inst[0] |= 0xc0000000;
673
674	if (!pc->allow32)
675		set_long(pc, e);
676
677	check_swap_src_0_1(pc, &src0, &src1);
678	set_dst(pc, dst, e);
679	set_src_0(pc, src0, e);
680	if (src1->type == P_IMMD && !is_long(e)) {
681		if (src0->neg)
682			e->inst[0] |= 0x00008000;
683		set_immd(pc, src1, e);
684	} else {
685		set_src_1(pc, src1, e);
686		if (src0->neg ^ src1->neg) {
687			if (is_long(e))
688				e->inst[1] |= 0x08000000;
689			else
690				e->inst[0] |= 0x00008000;
691		}
692	}
693
694	emit(pc, e);
695}
696
697static void
698emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
699	 struct nv50_reg *src0, struct nv50_reg *src1)
700{
701	struct nv50_program_exec *e = exec(pc);
702
703	e->inst[0] |= 0xb0000000;
704
705	check_swap_src_0_1(pc, &src0, &src1);
706
707	if (!pc->allow32 || src0->neg || src1->neg) {
708		set_long(pc, e);
709		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
710	}
711
712	set_dst(pc, dst, e);
713	set_src_0(pc, src0, e);
714	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
715		set_src_2(pc, src1, e);
716	else
717	if (src1->type == P_IMMD)
718		set_immd(pc, src1, e);
719	else
720		set_src_1(pc, src1, e);
721
722	emit(pc, e);
723}
724
725static void
726emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
727	    struct nv50_reg *src0, struct nv50_reg *src1)
728{
729	struct nv50_program_exec *e = exec(pc);
730
731	set_long(pc, e);
732	e->inst[0] |= 0xb0000000;
733	e->inst[1] |= (sub << 29);
734
735	check_swap_src_0_1(pc, &src0, &src1);
736	set_dst(pc, dst, e);
737	set_src_0(pc, src0, e);
738	set_src_1(pc, src1, e);
739
740	emit(pc, e);
741}
742
743static INLINE void
744emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
745	 struct nv50_reg *src1)
746{
747	src1->neg ^= 1;
748	emit_add(pc, dst, src0, src1);
749	src1->neg ^= 1;
750}
751
752static void
753emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
754	 struct nv50_reg *src1, struct nv50_reg *src2)
755{
756	struct nv50_program_exec *e = exec(pc);
757
758	e->inst[0] |= 0xe0000000;
759
760	check_swap_src_0_1(pc, &src0, &src1);
761	set_dst(pc, dst, e);
762	set_src_0(pc, src0, e);
763	set_src_1(pc, src1, e);
764	set_src_2(pc, src2, e);
765
766	if (src0->neg ^ src1->neg)
767		e->inst[1] |= 0x04000000;
768	if (src2->neg)
769		e->inst[1] |= 0x08000000;
770
771	emit(pc, e);
772}
773
774static INLINE void
775emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
776	 struct nv50_reg *src1, struct nv50_reg *src2)
777{
778	src2->neg ^= 1;
779	emit_mad(pc, dst, src0, src1, src2);
780	src2->neg ^= 1;
781}
782
783static void
784emit_flop(struct nv50_pc *pc, unsigned sub,
785	  struct nv50_reg *dst, struct nv50_reg *src)
786{
787	struct nv50_program_exec *e = exec(pc);
788
789	e->inst[0] |= 0x90000000;
790	if (sub) {
791		set_long(pc, e);
792		e->inst[1] |= (sub << 29);
793	}
794
795	set_dst(pc, dst, e);
796
797	if (sub == 0 || sub == 2)
798		set_src_0_restricted(pc, src, e);
799	else
800		set_src_0(pc, src, e);
801
802	emit(pc, e);
803}
804
805static void
806emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
807{
808	struct nv50_program_exec *e = exec(pc);
809
810	e->inst[0] |= 0xb0000000;
811
812	set_dst(pc, dst, e);
813	set_src_0(pc, src, e);
814	set_long(pc, e);
815	e->inst[1] |= (6 << 29) | 0x00004000;
816
817	emit(pc, e);
818}
819
820static void
821emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
822{
823	struct nv50_program_exec *e = exec(pc);
824
825	e->inst[0] |= 0xb0000000;
826
827	set_dst(pc, dst, e);
828	set_src_0(pc, src, e);
829	set_long(pc, e);
830	e->inst[1] |= (6 << 29);
831
832	emit(pc, e);
833}
834
835#define CVTOP_RN	0x01
836#define CVTOP_FLOOR	0x03
837#define CVTOP_CEIL	0x05
838#define CVTOP_TRUNC	0x07
839#define CVTOP_SAT	0x08
840#define CVTOP_ABS	0x10
841
842/* 0x04 == 32 bit dst */
843/* 0x40 == dst is float */
844/* 0x80 == src is float */
845#define CVT_F32_F32 0xc4
846#define CVT_F32_S32 0x44
847#define CVT_F32_U32 0x64
848#define CVT_S32_F32 0x8c
849#define CVT_S32_S32 0x0c
850#define CVT_NEG     0x20
851#define CVT_RI      0x08
852
853static void
854emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
855	 int wp, unsigned cvn, unsigned fmt)
856{
857	struct nv50_program_exec *e;
858
859	e = exec(pc);
860	set_long(pc, e);
861
862	e->inst[0] |= 0xa0000000;
863	e->inst[1] |= 0x00004000; /* 32 bit src */
864	e->inst[1] |= (cvn << 16);
865	e->inst[1] |= (fmt << 24);
866	set_src_0(pc, src, e);
867
868	if (wp >= 0)
869		set_pred_wr(pc, 1, wp, e);
870
871	if (dst)
872		set_dst(pc, dst, e);
873	else {
874		e->inst[0] |= 0x000001fc;
875		e->inst[1] |= 0x00000008;
876	}
877
878	emit(pc, e);
879}
880
881/* nv50 Condition codes:
882 *  0x1 = LT
883 *  0x2 = EQ
884 *  0x3 = LE
885 *  0x4 = GT
886 *  0x5 = NE
887 *  0x6 = GE
888 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
889 *  0x8 = unordered bit (allows NaN)
890 */
891static void
892emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
893	 struct nv50_reg *src0, struct nv50_reg *src1)
894{
895	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
896
897	struct nv50_program_exec *e = exec(pc);
898	struct nv50_reg *rdst;
899
900	assert(ccode < 16);
901	if (check_swap_src_0_1(pc, &src0, &src1))
902		ccode = cc_swapped[ccode & 7] | (ccode & 8);
903
904	rdst = dst;
905	if (dst && dst->type != P_TEMP)
906		dst = alloc_temp(pc, NULL);
907
908	/* set.u32 */
909	set_long(pc, e);
910	e->inst[0] |= 0xb0000000;
911	e->inst[1] |= 0x60000000 | (ccode << 14);
912
913	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
914	 * that doesn't seem to match what the hw actually does
915	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
916	 */
917
918	if (wp >= 0)
919		set_pred_wr(pc, 1, wp, e);
920	if (dst)
921		set_dst(pc, dst, e);
922	else {
923		e->inst[0] |= 0x000001fc;
924		e->inst[1] |= 0x00000008;
925	}
926
927	set_src_0(pc, src0, e);
928	set_src_1(pc, src1, e);
929
930	emit(pc, e);
931	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */
932
933	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
934	if (rdst)
935		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
936	if (rdst && rdst != dst)
937		free_temp(pc, dst);
938}
939
940static INLINE unsigned
941map_tgsi_setop_cc(unsigned op)
942{
943	switch (op) {
944	case TGSI_OPCODE_SLT: return 0x1;
945	case TGSI_OPCODE_SGE: return 0x6;
946	case TGSI_OPCODE_SEQ: return 0x2;
947	case TGSI_OPCODE_SGT: return 0x4;
948	case TGSI_OPCODE_SLE: return 0x3;
949	case TGSI_OPCODE_SNE: return 0xd;
950	default:
951		assert(0);
952		return 0;
953	}
954}
955
956static INLINE void
957emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
958{
959	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
960}
961
962static void
963emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
964	 struct nv50_reg *v, struct nv50_reg *e)
965{
966	struct nv50_reg *temp = alloc_temp(pc, NULL);
967
968	emit_flop(pc, 3, temp, v);
969	emit_mul(pc, temp, temp, e);
970	emit_preex2(pc, temp, temp);
971	emit_flop(pc, 6, dst, temp);
972
973	free_temp(pc, temp);
974}
975
976static INLINE void
977emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
978{
979	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
980}
981
982static INLINE void
983emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
984{
985	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
986}
987
988static void
989emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
990	 struct nv50_reg **src)
991{
992	struct nv50_reg *one = alloc_immd(pc, 1.0);
993	struct nv50_reg *zero = alloc_immd(pc, 0.0);
994	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
995	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
996	struct nv50_reg *tmp[4];
997	boolean allow32 = pc->allow32;
998
999	pc->allow32 = FALSE;
1000
1001	if (mask & (3 << 1)) {
1002		tmp[0] = alloc_temp(pc, NULL);
1003		emit_minmax(pc, 4, tmp[0], src[0], zero);
1004	}
1005
1006	if (mask & (1 << 2)) {
1007		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
1008
1009		tmp[1] = temp_temp(pc);
1010		emit_minmax(pc, 4, tmp[1], src[1], zero);
1011
1012		tmp[3] = temp_temp(pc);
1013		emit_minmax(pc, 4, tmp[3], src[3], neg128);
1014		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
1015
1016		emit_pow(pc, dst[2], tmp[1], tmp[3]);
1017		emit_mov(pc, dst[2], zero);
1018		set_pred(pc, 3, 0, pc->p->exec_tail);
1019	}
1020
1021	if (mask & (1 << 1))
1022		assimilate_temp(pc, dst[1], tmp[0]);
1023	else
1024	if (mask & (1 << 2))
1025		free_temp(pc, tmp[0]);
1026
1027	pc->allow32 = allow32;
1028
1029	/* do this last, in case src[i,j] == dst[0,3] */
1030	if (mask & (1 << 0))
1031		emit_mov(pc, dst[0], one);
1032
1033	if (mask & (1 << 3))
1034		emit_mov(pc, dst[3], one);
1035
1036	FREE(pos128);
1037	FREE(neg128);
1038	FREE(zero);
1039	FREE(one);
1040}
1041
1042static INLINE void
1043emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1044{
1045	emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
1046}
1047
1048static void
1049emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
1050{
1051	struct nv50_program_exec *e;
1052	const int r_pred = 1;
1053
1054	/* Sets predicate reg ? */
1055	e = exec(pc);
1056	e->inst[0] = 0xa00001fd;
1057	e->inst[1] = 0xc4014788;
1058	set_src_0(pc, src, e);
1059	set_pred_wr(pc, 1, r_pred, e);
1060	if (src->neg)
1061		e->inst[1] |= 0x20000000;
1062	emit(pc, e);
1063
1064	/* This is probably KILP */
1065	e = exec(pc);
1066	e->inst[0] = 0x000001fe;
1067	set_long(pc, e);
1068	set_pred(pc, 1 /* LT? */, r_pred, e);
1069	emit(pc, e);
1070}
1071
1072static void
1073emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1074	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
1075{
1076	struct nv50_reg *temp, *t[4];
1077	struct nv50_program_exec *e;
1078
1079	unsigned c, mode, dim;
1080
1081	switch (type) {
1082	case TGSI_TEXTURE_1D:
1083		dim = 1;
1084		break;
1085	case TGSI_TEXTURE_UNKNOWN:
1086	case TGSI_TEXTURE_2D:
1087	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1088	case TGSI_TEXTURE_RECT:
1089		dim = 2;
1090		break;
1091	case TGSI_TEXTURE_3D:
1092	case TGSI_TEXTURE_CUBE:
1093	case TGSI_TEXTURE_SHADOW2D:
1094	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1095		dim = 3;
1096		break;
1097	default:
1098		assert(0);
1099		break;
1100	}
1101
1102	/* some cards need t[0]'s hw index to be a multiple of 4 */
1103	alloc_temp4(pc, t, 0);
1104
1105	if (proj) {
1106		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1107			mode = pc->interp_mode[src[0]->index];
1108
1109			t[3]->rhw = src[3]->rhw;
1110			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1111			emit_flop(pc, 0, t[3], t[3]);
1112
1113			for (c = 0; c < dim; c++) {
1114				t[c]->rhw = src[c]->rhw;
1115				emit_interp(pc, t[c], t[3],
1116					    (mode | INTERP_PERSPECTIVE));
1117			}
1118		} else {
1119			emit_flop(pc, 0, t[3], src[3]);
1120			for (c = 0; c < dim; c++)
1121				emit_mul(pc, t[c], src[c], t[3]);
1122
1123			/* XXX: for some reason the blob sometimes uses MAD:
1124			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1125			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1126			 */
1127		}
1128	} else {
1129		if (type == TGSI_TEXTURE_CUBE) {
1130			temp = temp_temp(pc);
1131			emit_minmax(pc, 4, temp, src[0], src[1]);
1132			emit_minmax(pc, 4, temp, temp, src[2]);
1133			emit_flop(pc, 0, temp, temp);
1134			for (c = 0; c < 3; c++)
1135				emit_mul(pc, t[c], src[c], temp);
1136		} else {
1137			for (c = 0; c < dim; c++)
1138				emit_mov(pc, t[c], src[c]);
1139		}
1140	}
1141
1142	e = exec(pc);
1143	set_long(pc, e);
1144	e->inst[0] |= 0xf0000000;
1145	e->inst[1] |= 0x00000004;
1146	set_dst(pc, t[0], e);
1147	e->inst[0] |= (unit << 9);
1148
1149	if (dim == 2)
1150		e->inst[0] |= 0x00400000;
1151	else
1152	if (dim == 3)
1153		e->inst[0] |= 0x00800000;
1154
1155	e->inst[0] |= (mask & 0x3) << 25;
1156	e->inst[1] |= (mask & 0xc) << 12;
1157
1158	emit(pc, e);
1159
1160#if 1
1161	c = 0;
1162	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
1163	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
1164	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
1165	if (mask & 8) emit_mov(pc, dst[3], t[c]);
1166
1167	free_temp4(pc, t);
1168#else
1169	/* XXX: if p.e. MUL is used directly after TEX, it would still use
1170	 * the texture coordinates, not the fetched values: latency ? */
1171
1172	for (c = 0; c < 4; c++) {
1173		if (mask & (1 << c))
1174			assimilate_temp(pc, dst[c], t[c]);
1175		else
1176			free_temp(pc, t[c]);
1177	}
1178#endif
1179}
1180
1181static void
1182emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
1183	    struct nv50_program_exec **join)
1184{
1185	struct nv50_program_exec *e = exec(pc);
1186
1187	if (join) {
1188		set_long(pc, e);
1189		e->inst[0] |= 0xa0000002;
1190		emit(pc, e);
1191		*join = e;
1192		e = exec(pc);
1193	}
1194
1195	set_long(pc, e);
1196	e->inst[0] |= 0x10000002;
1197	if (pred >= 0)
1198		set_pred(pc, cc, pred, e);
1199	emit(pc, e);
1200}
1201
1202static void
1203emit_nop(struct nv50_pc *pc)
1204{
1205	struct nv50_program_exec *e = exec(pc);
1206
1207	e->inst[0] = 0xf0000000;
1208	set_long(pc, e);
1209	e->inst[1] = 0xe0000000;
1210	emit(pc, e);
1211}
1212
1213static void
1214emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1215{
1216	struct nv50_program_exec *e = exec(pc);
1217
1218	assert(src->type == P_TEMP);
1219
1220	e->inst[0] = 0xc0140000;
1221	e->inst[1] = 0x89800000;
1222	set_long(pc, e);
1223	set_dst(pc, dst, e);
1224	set_src_0(pc, src, e);
1225	set_src_2(pc, src, e);
1226
1227	emit(pc, e);
1228}
1229
1230static void
1231emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1232{
1233	struct nv50_program_exec *e = exec(pc);
1234
1235	assert(src->type == P_TEMP);
1236
1237	if (!src->neg) /* ! double negation */
1238		emit_neg(pc, src, src);
1239
1240	e->inst[0] = 0xc0150000;
1241	e->inst[1] = 0x8a400000;
1242	set_long(pc, e);
1243	set_dst(pc, dst, e);
1244	set_src_0(pc, src, e);
1245	set_src_2(pc, src, e);
1246
1247	emit(pc, e);
1248}
1249
1250static void
1251convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1252{
1253	unsigned q = 0, m = ~0;
1254
1255	assert(!is_long(e));
1256
1257	switch (e->inst[0] >> 28) {
1258	case 0x1:
1259		/* MOV */
1260		q = 0x0403c000;
1261		m = 0xffff7fff;
1262		break;
1263	case 0x8:
1264		/* INTERP (move centroid, perspective and flat bits) */
1265		m = ~0x03000100;
1266		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1267		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1268		break;
1269	case 0x9:
1270		/* RCP */
1271		break;
1272	case 0xB:
1273		/* ADD */
1274		m = ~(127 << 16);
1275		q = ((e->inst[0] & (~m)) >> 2);
1276		break;
1277	case 0xC:
1278		/* MUL */
1279		m = ~0x00008000;
1280		q = ((e->inst[0] & (~m)) << 12);
1281		break;
1282	case 0xE:
1283		/* MAD (if src2 == dst) */
1284		q = ((e->inst[0] & 0x1fc) << 12);
1285		break;
1286	default:
1287		assert(0);
1288		break;
1289	}
1290
1291	set_long(pc, e);
1292	pc->p->exec_size++;
1293
1294	e->inst[0] &= m;
1295	e->inst[1] |= q;
1296}
1297
1298/* Some operations support an optional negation flag. */
1299static boolean
1300negate_supported(const struct tgsi_full_instruction *insn, int i)
1301{
1302	int s;
1303
1304	switch (insn->Instruction.Opcode) {
1305	case TGSI_OPCODE_DDY:
1306	case TGSI_OPCODE_DP3:
1307	case TGSI_OPCODE_DP4:
1308	case TGSI_OPCODE_MUL:
1309	case TGSI_OPCODE_KIL:
1310	case TGSI_OPCODE_ADD:
1311	case TGSI_OPCODE_SUB:
1312	case TGSI_OPCODE_MAD:
1313		break;
1314	case TGSI_OPCODE_POW:
1315		if (i == 1)
1316			break;
1317		return FALSE;
1318	default:
1319		return FALSE;
1320	}
1321
1322	/* Watch out for possible multiple uses of an nv50_reg, we
1323	 * can't use nv50_reg::neg in these cases.
1324	 */
1325	for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) {
1326		if (s == i)
1327			continue;
1328		if ((insn->FullSrcRegisters[s].SrcRegister.Index ==
1329		     insn->FullSrcRegisters[i].SrcRegister.Index) &&
1330		    (insn->FullSrcRegisters[s].SrcRegister.File ==
1331		     insn->FullSrcRegisters[i].SrcRegister.File))
1332			return FALSE;
1333	}
1334
1335	return TRUE;
1336}
1337
1338/* Return a read mask for source registers deduced from opcode & write mask. */
1339static unsigned
1340nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
1341{
1342	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
1343
1344	switch (insn->Instruction.Opcode) {
1345	case TGSI_OPCODE_COS:
1346	case TGSI_OPCODE_SIN:
1347		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
1348	case TGSI_OPCODE_DP3:
1349		return 0x7;
1350	case TGSI_OPCODE_DP4:
1351	case TGSI_OPCODE_DPH:
1352	case TGSI_OPCODE_KIL: /* WriteMask ignored */
1353		return 0xf;
1354	case TGSI_OPCODE_DST:
1355		return mask & (c ? 0xa : 0x6);
1356	case TGSI_OPCODE_EX2:
1357	case TGSI_OPCODE_LG2:
1358	case TGSI_OPCODE_POW:
1359	case TGSI_OPCODE_RCP:
1360	case TGSI_OPCODE_RSQ:
1361	case TGSI_OPCODE_SCS:
1362		return 0x1;
1363	case TGSI_OPCODE_LIT:
1364		return 0xb;
1365	case TGSI_OPCODE_TEX:
1366	case TGSI_OPCODE_TXP:
1367	{
1368		const struct tgsi_instruction_ext_texture *tex;
1369
1370		assert(insn->Instruction.Extended);
1371		tex = &insn->InstructionExtTexture;
1372
1373		mask = 0x7;
1374		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1375			mask |= 0x8;
1376
1377		switch (tex->Texture) {
1378		case TGSI_TEXTURE_1D:
1379			mask &= 0x9;
1380			break;
1381		case TGSI_TEXTURE_2D:
1382			mask &= 0xb;
1383			break;
1384		default:
1385			break;
1386		}
1387	}
1388		return mask;
1389	case TGSI_OPCODE_XPD:
1390		x = 0;
1391		if (mask & 1) x |= 0x6;
1392		if (mask & 2) x |= 0x5;
1393		if (mask & 4) x |= 0x3;
1394		return x;
1395	default:
1396		break;
1397	}
1398
1399	return mask;
1400}
1401
1402static struct nv50_reg *
1403tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1404{
1405	switch (dst->DstRegister.File) {
1406	case TGSI_FILE_TEMPORARY:
1407		return &pc->temp[dst->DstRegister.Index * 4 + c];
1408	case TGSI_FILE_OUTPUT:
1409		return &pc->result[dst->DstRegister.Index * 4 + c];
1410	case TGSI_FILE_NULL:
1411		return NULL;
1412	default:
1413		break;
1414	}
1415
1416	return NULL;
1417}
1418
1419static struct nv50_reg *
1420tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1421	 boolean neg)
1422{
1423	struct nv50_reg *r = NULL;
1424	struct nv50_reg *temp;
1425	unsigned sgn, c;
1426
1427	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1428
1429	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1430	switch (c) {
1431	case TGSI_EXTSWIZZLE_X:
1432	case TGSI_EXTSWIZZLE_Y:
1433	case TGSI_EXTSWIZZLE_Z:
1434	case TGSI_EXTSWIZZLE_W:
1435		switch (src->SrcRegister.File) {
1436		case TGSI_FILE_INPUT:
1437			r = &pc->attr[src->SrcRegister.Index * 4 + c];
1438			break;
1439		case TGSI_FILE_TEMPORARY:
1440			r = &pc->temp[src->SrcRegister.Index * 4 + c];
1441			break;
1442		case TGSI_FILE_CONSTANT:
1443			r = &pc->param[src->SrcRegister.Index * 4 + c];
1444			break;
1445		case TGSI_FILE_IMMEDIATE:
1446			r = &pc->immd[src->SrcRegister.Index * 4 + c];
1447			break;
1448		case TGSI_FILE_SAMPLER:
1449			break;
1450		default:
1451			assert(0);
1452			break;
1453		}
1454		break;
1455	case TGSI_EXTSWIZZLE_ZERO:
1456		r = alloc_immd(pc, 0.0);
1457		return r;
1458	case TGSI_EXTSWIZZLE_ONE:
1459		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1460			return alloc_immd(pc, -1.0);
1461		return alloc_immd(pc, 1.0);
1462	default:
1463		assert(0);
1464		break;
1465	}
1466
1467	switch (sgn) {
1468	case TGSI_UTIL_SIGN_KEEP:
1469		break;
1470	case TGSI_UTIL_SIGN_CLEAR:
1471		temp = temp_temp(pc);
1472		emit_abs(pc, temp, r);
1473		r = temp;
1474		break;
1475	case TGSI_UTIL_SIGN_TOGGLE:
1476		if (neg)
1477			r->neg = 1;
1478		else {
1479			temp = temp_temp(pc);
1480			emit_neg(pc, temp, r);
1481			r = temp;
1482		}
1483		break;
1484	case TGSI_UTIL_SIGN_SET:
1485		temp = temp_temp(pc);
1486		emit_abs(pc, temp, r);
1487		if (neg)
1488			temp->neg = 1;
1489		else
1490			emit_neg(pc, temp, temp);
1491		r = temp;
1492		break;
1493	default:
1494		assert(0);
1495		break;
1496	}
1497
1498	return r;
1499}
1500
1501/* return TRUE for ops that produce only a single result */
1502static boolean
1503is_scalar_op(unsigned op)
1504{
1505	switch (op) {
1506	case TGSI_OPCODE_COS:
1507	case TGSI_OPCODE_DP2:
1508	case TGSI_OPCODE_DP3:
1509	case TGSI_OPCODE_DP4:
1510	case TGSI_OPCODE_DPH:
1511	case TGSI_OPCODE_EX2:
1512	case TGSI_OPCODE_LG2:
1513	case TGSI_OPCODE_POW:
1514	case TGSI_OPCODE_RCP:
1515	case TGSI_OPCODE_RSQ:
1516	case TGSI_OPCODE_SIN:
1517		/*
1518	case TGSI_OPCODE_KIL:
1519	case TGSI_OPCODE_LIT:
1520	case TGSI_OPCODE_SCS:
1521		*/
1522		return TRUE;
1523	default:
1524		return FALSE;
1525	}
1526}
1527
1528/* Returns a bitmask indicating which dst components depend
1529 * on source s, component c (reverse of nv50_tgsi_src_mask).
1530 */
1531static unsigned
1532nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1533{
1534	if (is_scalar_op(op))
1535		return 0x1;
1536
1537	switch (op) {
1538	case TGSI_OPCODE_DST:
1539		return (1 << c) & (s ? 0xa : 0x6);
1540	case TGSI_OPCODE_XPD:
1541		switch (c) {
1542		case 0: return 0x6;
1543		case 1: return 0x5;
1544		case 2: return 0x3;
1545		case 3: return 0x0;
1546		default:
1547			assert(0);
1548			return 0x0;
1549		}
1550	case TGSI_OPCODE_LIT:
1551	case TGSI_OPCODE_SCS:
1552	case TGSI_OPCODE_TEX:
1553	case TGSI_OPCODE_TXP:
1554		/* these take care of dangerous swizzles themselves */
1555		return 0x0;
1556	case TGSI_OPCODE_IF:
1557	case TGSI_OPCODE_KIL:
1558		/* don't call this function for these ops */
1559		assert(0);
1560		return 0;
1561	default:
1562		/* linear vector instruction */
1563		return (1 << c);
1564	}
1565}
1566
1567static INLINE boolean
1568has_pred(struct nv50_program_exec *e, unsigned cc)
1569{
1570	if (!is_long(e) || is_immd(e))
1571		return FALSE;
1572	return ((e->inst[1] & 0x780) == (cc << 7));
1573}
1574
1575/* on ENDIF see if we can do "@p0.neu single_op" instead of:
1576 *        join_at ENDIF
1577 *        @p0.eq bra ENDIF
1578 *        single_op
1579 * ENDIF: nop.join
1580 */
1581static boolean
1582nv50_kill_branch(struct nv50_pc *pc)
1583{
1584	int lvl = pc->if_lvl;
1585
1586	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
1587		return FALSE;
1588
1589	/* if ccode == 'true', the BRA is from an ELSE and the predicate
1590	 * reg may no longer be valid, since we currently always use $p0
1591	 */
1592	if (has_pred(pc->if_insn[lvl], 0xf))
1593		return FALSE;
1594	assert(pc->if_insn[lvl] && pc->br_join[lvl]);
1595
1596	/* We'll use the exec allocated for JOIN_AT (as we can't easily
1597	 * update prev's next); if exec_tail is BRK, update the pointer.
1598	 */
1599	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
1600		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];
1601
1602	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
1603
1604	*pc->br_join[lvl] = *pc->p->exec_tail;
1605
1606	FREE(pc->if_insn[lvl]);
1607	FREE(pc->p->exec_tail);
1608
1609	pc->p->exec_tail = pc->br_join[lvl];
1610	pc->p->exec_tail->next = NULL;
1611	set_pred(pc, 0xd, 0, pc->p->exec_tail);
1612
1613	return TRUE;
1614}
1615
1616static boolean
1617nv50_program_tx_insn(struct nv50_pc *pc,
1618		     const struct tgsi_full_instruction *inst)
1619{
1620	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
1621	unsigned mask, sat, unit;
1622	int i, c;
1623
1624	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1625	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1626
1627	memset(src, 0, sizeof(src));
1628
1629	for (c = 0; c < 4; c++) {
1630		if ((mask & (1 << c)) && !pc->r_dst[c])
1631			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1632		else
1633			dst[c] = pc->r_dst[c];
1634		rdst[c] = dst[c];
1635	}
1636
1637	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1638		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1639		unsigned src_mask;
1640		boolean neg_supp;
1641
1642		src_mask = nv50_tgsi_src_mask(inst, i);
1643		neg_supp = negate_supported(inst, i);
1644
1645		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1646			unit = fs->SrcRegister.Index;
1647
1648		for (c = 0; c < 4; c++)
1649			if (src_mask & (1 << c))
1650				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
1651	}
1652
1653	brdc = temp = pc->r_brdc;
1654	if (brdc && brdc->type != P_TEMP) {
1655		temp = temp_temp(pc);
1656		if (sat)
1657			brdc = temp;
1658	} else
1659	if (sat) {
1660		for (c = 0; c < 4; c++) {
1661			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
1662				continue;
1663			rdst[c] = dst[c];
1664			dst[c] = temp_temp(pc);
1665		}
1666	}
1667
1668	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
1669
1670	switch (inst->Instruction.Opcode) {
1671	case TGSI_OPCODE_ABS:
1672		for (c = 0; c < 4; c++) {
1673			if (!(mask & (1 << c)))
1674				continue;
1675			emit_abs(pc, dst[c], src[0][c]);
1676		}
1677		break;
1678	case TGSI_OPCODE_ADD:
1679		for (c = 0; c < 4; c++) {
1680			if (!(mask & (1 << c)))
1681				continue;
1682			emit_add(pc, dst[c], src[0][c], src[1][c]);
1683		}
1684		break;
1685	case TGSI_OPCODE_BGNLOOP:
1686		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
1687		break;
1688	case TGSI_OPCODE_BRK:
1689		emit_branch(pc, -1, 0, NULL);
1690		assert(pc->loop_lvl > 0);
1691		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
1692		break;
1693	case TGSI_OPCODE_CEIL:
1694		for (c = 0; c < 4; c++) {
1695			if (!(mask & (1 << c)))
1696				continue;
1697			emit_cvt(pc, dst[c], src[0][c], -1,
1698				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
1699		}
1700		break;
1701	case TGSI_OPCODE_CMP:
1702		pc->allow32 = FALSE;
1703		for (c = 0; c < 4; c++) {
1704			if (!(mask & (1 << c)))
1705				continue;
1706			emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
1707			emit_mov(pc, dst[c], src[1][c]);
1708			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
1709			emit_mov(pc, dst[c], src[2][c]);
1710			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
1711		}
1712		break;
1713	case TGSI_OPCODE_COS:
1714		if (mask & 8) {
1715			emit_precossin(pc, temp, src[0][3]);
1716			emit_flop(pc, 5, dst[3], temp);
1717			if (!(mask &= 7))
1718				break;
1719			if (temp == dst[3])
1720				temp = brdc = temp_temp(pc);
1721		}
1722		emit_precossin(pc, temp, src[0][0]);
1723		emit_flop(pc, 5, brdc, temp);
1724		break;
1725	case TGSI_OPCODE_DDX:
1726		for (c = 0; c < 4; c++) {
1727			if (!(mask & (1 << c)))
1728				continue;
1729			emit_ddx(pc, dst[c], src[0][c]);
1730		}
1731		break;
1732	case TGSI_OPCODE_DDY:
1733		for (c = 0; c < 4; c++) {
1734			if (!(mask & (1 << c)))
1735				continue;
1736			emit_ddy(pc, dst[c], src[0][c]);
1737		}
1738		break;
1739	case TGSI_OPCODE_DP3:
1740		emit_mul(pc, temp, src[0][0], src[1][0]);
1741		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1742		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
1743		break;
1744	case TGSI_OPCODE_DP4:
1745		emit_mul(pc, temp, src[0][0], src[1][0]);
1746		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1747		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1748		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
1749		break;
1750	case TGSI_OPCODE_DPH:
1751		emit_mul(pc, temp, src[0][0], src[1][0]);
1752		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1753		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1754		emit_add(pc, brdc, src[1][3], temp);
1755		break;
1756	case TGSI_OPCODE_DST:
1757		if (mask & (1 << 1))
1758			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1759		if (mask & (1 << 2))
1760			emit_mov(pc, dst[2], src[0][2]);
1761		if (mask & (1 << 3))
1762			emit_mov(pc, dst[3], src[1][3]);
1763		if (mask & (1 << 0))
1764			emit_mov_immdval(pc, dst[0], 1.0f);
1765		break;
1766	case TGSI_OPCODE_ELSE:
1767		emit_branch(pc, -1, 0, NULL);
1768		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
1769		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
1770		break;
1771	case TGSI_OPCODE_ENDIF:
1772		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
1773
1774		/* try to replace branch over 1 insn with a predicated insn */
1775		if (nv50_kill_branch(pc) == TRUE)
1776			break;
1777
1778		if (pc->br_join[pc->if_lvl]) {
1779			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
1780			pc->br_join[pc->if_lvl] = NULL;
1781		}
1782		/* emit a NOP as join point, we could set it on the next
1783		 * one, but would have to make sure it is long and !immd
1784		 */
1785		emit_nop(pc);
1786		pc->p->exec_tail->inst[1] |= 2;
1787		break;
1788	case TGSI_OPCODE_ENDLOOP:
1789		emit_branch(pc, -1, 0, NULL);
1790		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
1791		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
1792		break;
1793	case TGSI_OPCODE_EX2:
1794		emit_preex2(pc, temp, src[0][0]);
1795		emit_flop(pc, 6, brdc, temp);
1796		break;
1797	case TGSI_OPCODE_FLR:
1798		for (c = 0; c < 4; c++) {
1799			if (!(mask & (1 << c)))
1800				continue;
1801			emit_flr(pc, dst[c], src[0][c]);
1802		}
1803		break;
1804	case TGSI_OPCODE_FRC:
1805		temp = temp_temp(pc);
1806		for (c = 0; c < 4; c++) {
1807			if (!(mask & (1 << c)))
1808				continue;
1809			emit_flr(pc, temp, src[0][c]);
1810			emit_sub(pc, dst[c], src[0][c], temp);
1811		}
1812		break;
1813	case TGSI_OPCODE_IF:
1814		/* emitting a join_at may not be necessary */
1815		assert(pc->if_lvl < MAX_IF_DEPTH);
1816		set_pred_wr(pc, 1, 0, pc->if_cond);
1817		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
1818		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
1819		break;
1820	case TGSI_OPCODE_KIL:
1821		emit_kil(pc, src[0][0]);
1822		emit_kil(pc, src[0][1]);
1823		emit_kil(pc, src[0][2]);
1824		emit_kil(pc, src[0][3]);
1825		break;
1826	case TGSI_OPCODE_LIT:
1827		emit_lit(pc, &dst[0], mask, &src[0][0]);
1828		break;
1829	case TGSI_OPCODE_LG2:
1830		emit_flop(pc, 3, brdc, src[0][0]);
1831		break;
1832	case TGSI_OPCODE_LRP:
1833		temp = temp_temp(pc);
1834		for (c = 0; c < 4; c++) {
1835			if (!(mask & (1 << c)))
1836				continue;
1837			emit_sub(pc, temp, src[1][c], src[2][c]);
1838			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1839		}
1840		break;
1841	case TGSI_OPCODE_MAD:
1842		for (c = 0; c < 4; c++) {
1843			if (!(mask & (1 << c)))
1844				continue;
1845			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1846		}
1847		break;
1848	case TGSI_OPCODE_MAX:
1849		for (c = 0; c < 4; c++) {
1850			if (!(mask & (1 << c)))
1851				continue;
1852			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1853		}
1854		break;
1855	case TGSI_OPCODE_MIN:
1856		for (c = 0; c < 4; c++) {
1857			if (!(mask & (1 << c)))
1858				continue;
1859			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1860		}
1861		break;
1862	case TGSI_OPCODE_MOV:
1863	case TGSI_OPCODE_SWZ:
1864		for (c = 0; c < 4; c++) {
1865			if (!(mask & (1 << c)))
1866				continue;
1867			emit_mov(pc, dst[c], src[0][c]);
1868		}
1869		break;
1870	case TGSI_OPCODE_MUL:
1871		for (c = 0; c < 4; c++) {
1872			if (!(mask & (1 << c)))
1873				continue;
1874			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1875		}
1876		break;
1877	case TGSI_OPCODE_POW:
1878		emit_pow(pc, brdc, src[0][0], src[1][0]);
1879		break;
1880	case TGSI_OPCODE_RCP:
1881		emit_flop(pc, 0, brdc, src[0][0]);
1882		break;
1883	case TGSI_OPCODE_RSQ:
1884		emit_flop(pc, 2, brdc, src[0][0]);
1885		break;
1886	case TGSI_OPCODE_SCS:
1887		temp = temp_temp(pc);
1888		if (mask & 3)
1889			emit_precossin(pc, temp, src[0][0]);
1890		if (mask & (1 << 0))
1891			emit_flop(pc, 5, dst[0], temp);
1892		if (mask & (1 << 1))
1893			emit_flop(pc, 4, dst[1], temp);
1894		if (mask & (1 << 2))
1895			emit_mov_immdval(pc, dst[2], 0.0);
1896		if (mask & (1 << 3))
1897			emit_mov_immdval(pc, dst[3], 1.0);
1898		break;
1899	case TGSI_OPCODE_SIN:
1900		if (mask & 8) {
1901			emit_precossin(pc, temp, src[0][3]);
1902			emit_flop(pc, 4, dst[3], temp);
1903			if (!(mask &= 7))
1904				break;
1905			if (temp == dst[3])
1906				temp = brdc = temp_temp(pc);
1907		}
1908		emit_precossin(pc, temp, src[0][0]);
1909		emit_flop(pc, 4, brdc, temp);
1910		break;
1911	case TGSI_OPCODE_SLT:
1912	case TGSI_OPCODE_SGE:
1913	case TGSI_OPCODE_SEQ:
1914	case TGSI_OPCODE_SGT:
1915	case TGSI_OPCODE_SLE:
1916	case TGSI_OPCODE_SNE:
1917		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
1918		for (c = 0; c < 4; c++) {
1919			if (!(mask & (1 << c)))
1920				continue;
1921			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
1922		}
1923		break;
1924	case TGSI_OPCODE_SUB:
1925		for (c = 0; c < 4; c++) {
1926			if (!(mask & (1 << c)))
1927				continue;
1928			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1929		}
1930		break;
1931	case TGSI_OPCODE_TEX:
1932		emit_tex(pc, dst, mask, src[0], unit,
1933			 inst->InstructionExtTexture.Texture, FALSE);
1934		break;
1935	case TGSI_OPCODE_TXP:
1936		emit_tex(pc, dst, mask, src[0], unit,
1937			 inst->InstructionExtTexture.Texture, TRUE);
1938		break;
1939	case TGSI_OPCODE_TRUNC:
1940		for (c = 0; c < 4; c++) {
1941			if (!(mask & (1 << c)))
1942				continue;
1943			emit_cvt(pc, dst[c], src[0][c], -1,
1944				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
1945		}
1946		break;
1947	case TGSI_OPCODE_XPD:
1948		temp = temp_temp(pc);
1949		if (mask & (1 << 0)) {
1950			emit_mul(pc, temp, src[0][2], src[1][1]);
1951			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1952		}
1953		if (mask & (1 << 1)) {
1954			emit_mul(pc, temp, src[0][0], src[1][2]);
1955			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1956		}
1957		if (mask & (1 << 2)) {
1958			emit_mul(pc, temp, src[0][1], src[1][0]);
1959			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1960		}
1961		if (mask & (1 << 3))
1962			emit_mov_immdval(pc, dst[3], 1.0);
1963		break;
1964	case TGSI_OPCODE_END:
1965		break;
1966	default:
1967		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1968		return FALSE;
1969	}
1970
1971	if (brdc) {
1972		if (sat)
1973			emit_sat(pc, brdc, brdc);
1974		for (c = 0; c < 4; c++)
1975			if ((mask & (1 << c)) && dst[c] != brdc)
1976				emit_mov(pc, dst[c], brdc);
1977	} else
1978	if (sat) {
1979		for (c = 0; c < 4; c++) {
1980			if (!(mask & (1 << c)))
1981				continue;
1982			/* in this case we saturate later */
1983			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
1984				continue;
1985			emit_sat(pc, rdst[c], dst[c]);
1986		}
1987	}
1988
1989	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1990		for (c = 0; c < 4; c++) {
1991			if (!src[i][c])
1992				continue;
1993			src[i][c]->neg = 0;
1994			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1995				FREE(src[i][c]);
1996		}
1997	}
1998
1999	kill_temp_temp(pc);
2000	return TRUE;
2001}
2002
2003static void
2004prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
2005{
2006	struct nv50_reg *reg = NULL;
2007	const struct tgsi_full_src_register *src;
2008	const struct tgsi_dst_register *dst;
2009	unsigned i, c, k, mask;
2010
2011	dst = &insn->FullDstRegisters[0].DstRegister;
2012	mask = dst->WriteMask;
2013
2014        if (dst->File == TGSI_FILE_TEMPORARY)
2015                reg = pc->temp;
2016        else
2017        if (dst->File == TGSI_FILE_OUTPUT)
2018                reg = pc->result;
2019
2020	if (reg) {
2021		for (c = 0; c < 4; c++) {
2022			if (!(mask & (1 << c)))
2023				continue;
2024			reg[dst->Index * 4 + c].acc = pc->insn_nr;
2025		}
2026	}
2027
2028	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2029		src = &insn->FullSrcRegisters[i];
2030
2031		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
2032			reg = pc->temp;
2033		else
2034		if (src->SrcRegister.File == TGSI_FILE_INPUT)
2035			reg = pc->attr;
2036		else
2037			continue;
2038
2039		mask = nv50_tgsi_src_mask(insn, i);
2040
2041		for (c = 0; c < 4; c++) {
2042			if (!(mask & (1 << c)))
2043				continue;
2044			k = tgsi_util_get_full_src_register_extswizzle(src, c);
2045
2046			if (k > TGSI_EXTSWIZZLE_W)
2047				continue;
2048
2049			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
2050		}
2051	}
2052}
2053
/* Find an order in which to write the dst components so that sources are
 * not overwritten before they are read, and report what could not be
 * resolved by reordering alone.
 *
 * m[i]    (out) component to write in the i-th position
 * rdep[c] (in)  bitmask of dst components that require dst[c] as source
 *
 * Returns a bitmask of positions (indices into m[], NOT component
 * numbers) whose component must be written to a temporary first because a
 * component written later still depends on it.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* unsafe is only ever OR'ed into, so it must start out empty */
	unsigned i, c, x, unsafe = 0;

	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
2098
2099/* Select a suitable dst register for broadcasting scalar results,
2100 * or return NULL if we have to allocate an extra TEMP.
2101 *
2102 * If e.g. only 1 component is written, we may also emit the final
2103 * result to a write-only register.
2104 */
2105static struct nv50_reg *
2106tgsi_broadcast_dst(struct nv50_pc *pc,
2107		   const struct tgsi_full_dst_register *fd, unsigned mask)
2108{
2109	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
2110		int c = ffs(~mask & fd->DstRegister.WriteMask);
2111		if (c)
2112			return tgsi_dst(pc, c - 1, fd);
2113	} else {
2114		int c = ffs(fd->DstRegister.WriteMask) - 1;
2115		if ((1 << c) == fd->DstRegister.WriteMask)
2116			return tgsi_dst(pc, c, fd);
2117	}
2118
2119	return NULL;
2120}
2121
2122/* Scan source swizzles and return a bitmask indicating dst regs that
2123 * also occur among the src regs, and fill rdep for nv50_revdep_reoder.
2124 */
2125static unsigned
2126nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
2127		       unsigned rdep[4])
2128{
2129	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
2130	const struct tgsi_full_src_register *fs;
2131	unsigned i, deqs = 0;
2132
2133	for (i = 0; i < 4; ++i)
2134		rdep[i] = 0;
2135
2136	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2137		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
2138		boolean neg_supp = negate_supported(insn, i);
2139
2140		fs = &insn->FullSrcRegisters[i];
2141		if (fs->SrcRegister.File != fd->DstRegister.File ||
2142		    fs->SrcRegister.Index != fd->DstRegister.Index)
2143			continue;
2144
2145		for (chn = 0; chn < 4; ++chn) {
2146			unsigned s, c;
2147
2148			if (!(mask & (1 << chn))) /* src is not read */
2149				continue;
2150			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
2151			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
2152
2153			if (c > TGSI_EXTSWIZZLE_W ||
2154			    !(fd->DstRegister.WriteMask & (1 << c)))
2155				continue;
2156
2157			/* no danger if src is copied to TEMP first */
2158			if ((s != TGSI_UTIL_SIGN_KEEP) &&
2159			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
2160				continue;
2161
2162			rdep[c] |= nv50_tgsi_dst_revdep(
2163				insn->Instruction.Opcode, i, chn);
2164			deqs |= (1 << c);
2165		}
2166	}
2167
2168	return deqs;
2169}
2170
2171static boolean
2172nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
2173{
2174	struct tgsi_full_instruction insn = tok->FullInstruction;
2175	const struct tgsi_full_dst_register *fd;
2176	unsigned i, deqs, rdep[4], m[4];
2177
2178	fd = &tok->FullInstruction.FullDstRegisters[0];
2179	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
2180
2181	if (is_scalar_op(insn.Instruction.Opcode)) {
2182		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
2183		if (!pc->r_brdc)
2184			pc->r_brdc = temp_temp(pc);
2185		return nv50_program_tx_insn(pc, &insn);
2186	}
2187	pc->r_brdc = NULL;
2188
2189	if (!deqs)
2190		return nv50_program_tx_insn(pc, &insn);
2191
2192	deqs = nv50_revdep_reorder(m, rdep);
2193
2194	for (i = 0; i < 4; ++i) {
2195		assert(pc->r_dst[m[i]] == NULL);
2196
2197		insn.FullDstRegisters[0].DstRegister.WriteMask =
2198			fd->DstRegister.WriteMask & (1 << m[i]);
2199
2200		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
2201			continue;
2202
2203		if (deqs & (1 << i))
2204			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
2205
2206		if (!nv50_program_tx_insn(pc, &insn))
2207			return FALSE;
2208	}
2209
2210	for (i = 0; i < 4; i++) {
2211		struct nv50_reg *reg = pc->r_dst[i];
2212		if (!reg)
2213			continue;
2214		pc->r_dst[i] = NULL;
2215
2216		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
2217			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
2218		else
2219			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
2220		free_temp(pc, reg);
2221	}
2222
2223	return TRUE;
2224}
2225
2226static void
2227load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
2228{
2229	struct nv50_reg *iv, **ppiv;
2230	unsigned mode = pc->interp_mode[reg->index];
2231
2232	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
2233	iv = *ppiv;
2234
2235	if ((mode & INTERP_PERSPECTIVE) && !iv) {
2236		iv = *ppiv = alloc_temp(pc, NULL);
2237		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
2238
2239		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2240		emit_flop(pc, 0, iv, iv);
2241
2242		/* XXX: when loading interpolants dynamically, move these
2243		 * to the program head, or make sure it can't be skipped.
2244		 */
2245	}
2246
2247	emit_interp(pc, reg, iv, mode);
2248}
2249
2250static boolean
2251nv50_program_tx_prep(struct nv50_pc *pc)
2252{
2253	struct tgsi_parse_context tp;
2254	struct nv50_program *p = pc->p;
2255	boolean ret = FALSE;
2256	unsigned i, c, flat_nr = 0;
2257
2258	tgsi_parse_init(&tp, pc->p->pipe.tokens);
2259	while (!tgsi_parse_end_of_tokens(&tp)) {
2260		const union tgsi_full_token *tok = &tp.FullToken;
2261
2262		tgsi_parse_token(&tp);
2263		switch (tok->Token.Type) {
2264		case TGSI_TOKEN_TYPE_IMMEDIATE:
2265		{
2266			const struct tgsi_full_immediate *imm =
2267				&tp.FullToken.FullImmediate;
2268
2269			ctor_immd(pc, imm->u[0].Float,
2270				      imm->u[1].Float,
2271				      imm->u[2].Float,
2272				      imm->u[3].Float);
2273		}
2274			break;
2275		case TGSI_TOKEN_TYPE_DECLARATION:
2276		{
2277			const struct tgsi_full_declaration *d;
2278			unsigned si, last, first, mode;
2279
2280			d = &tp.FullToken.FullDeclaration;
2281			first = d->DeclarationRange.First;
2282			last = d->DeclarationRange.Last;
2283
2284			switch (d->Declaration.File) {
2285			case TGSI_FILE_TEMPORARY:
2286				break;
2287			case TGSI_FILE_OUTPUT:
2288				if (!d->Declaration.Semantic ||
2289				    p->type == PIPE_SHADER_FRAGMENT)
2290					break;
2291
2292				si = d->Semantic.SemanticIndex;
2293				switch (d->Semantic.SemanticName) {
2294				case TGSI_SEMANTIC_BCOLOR:
2295					p->cfg.two_side[si].hw = first;
2296					if (p->cfg.io_nr > first)
2297						p->cfg.io_nr = first;
2298					break;
2299				case TGSI_SEMANTIC_PSIZE:
2300					p->cfg.psiz = first;
2301					if (p->cfg.io_nr > first)
2302						p->cfg.io_nr = first;
2303					break;
2304					/*
2305				case TGSI_SEMANTIC_CLIP_DISTANCE:
2306					p->cfg.clpd = MIN2(p->cfg.clpd, first);
2307					break;
2308					*/
2309				default:
2310					break;
2311				}
2312				break;
2313			case TGSI_FILE_INPUT:
2314			{
2315				if (p->type != PIPE_SHADER_FRAGMENT)
2316					break;
2317
2318				switch (d->Declaration.Interpolate) {
2319				case TGSI_INTERPOLATE_CONSTANT:
2320					mode = INTERP_FLAT;
2321					flat_nr++;
2322					break;
2323				case TGSI_INTERPOLATE_PERSPECTIVE:
2324					mode = INTERP_PERSPECTIVE;
2325					p->cfg.regs[1] |= 0x08 << 24;
2326					break;
2327				default:
2328					mode = INTERP_LINEAR;
2329					break;
2330				}
2331				if (d->Declaration.Centroid)
2332					mode |= INTERP_CENTROID;
2333
2334				assert(last < 32);
2335				for (i = first; i <= last; i++)
2336					pc->interp_mode[i] = mode;
2337			}
2338				break;
2339			case TGSI_FILE_CONSTANT:
2340				break;
2341			case TGSI_FILE_SAMPLER:
2342				break;
2343			default:
2344				NOUVEAU_ERR("bad decl file %d\n",
2345					    d->Declaration.File);
2346				goto out_err;
2347			}
2348		}
2349			break;
2350		case TGSI_TOKEN_TYPE_INSTRUCTION:
2351			pc->insn_nr++;
2352			prep_inspect_insn(pc, &tok->FullInstruction);
2353			break;
2354		default:
2355			break;
2356		}
2357	}
2358
2359	if (p->type == PIPE_SHADER_VERTEX) {
2360		int rid = 0;
2361
2362		for (i = 0; i < pc->attr_nr * 4; ++i) {
2363			if (pc->attr[i].acc) {
2364				pc->attr[i].hw = rid++;
2365				p->cfg.attr[i / 32] |= 1 << (i % 32);
2366			}
2367		}
2368
2369		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
2370			p->cfg.io[i].hw = rid;
2371			p->cfg.io[i].id_vp = i;
2372
2373			for (c = 0; c < 4; ++c) {
2374				int n = i * 4 + c;
2375				if (!pc->result[n].acc)
2376					continue;
2377				pc->result[n].hw = rid++;
2378				p->cfg.io[i].mask |= 1 << c;
2379			}
2380		}
2381
2382		for (c = 0; c < 2; ++c)
2383			if (p->cfg.two_side[c].hw < 0x40)
2384				p->cfg.two_side[c] = p->cfg.io[
2385					p->cfg.two_side[c].hw];
2386
2387		if (p->cfg.psiz < 0x40)
2388			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
2389	} else
2390	if (p->type == PIPE_SHADER_FRAGMENT) {
2391		int rid, aid;
2392		unsigned n = 0, m = pc->attr_nr - flat_nr;
2393
2394		int base = (TGSI_SEMANTIC_POSITION ==
2395			    p->info.input_semantic_name[0]) ? 0 : 1;
2396
2397		/* non-flat interpolants have to be mapped to
2398		 * the lower hardware IDs, so sort them:
2399		 */
2400		for (i = 0; i < pc->attr_nr; i++) {
2401			if (pc->interp_mode[i] == INTERP_FLAT) {
2402				p->cfg.io[m].id_vp = i + base;
2403				p->cfg.io[m++].id_fp = i;
2404			} else {
2405				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
2406					p->cfg.io[n].linear = TRUE;
2407				p->cfg.io[n].id_vp = i + base;
2408				p->cfg.io[n++].id_fp = i;
2409			}
2410		}
2411
2412		if (!base) /* set w-coordinate mask from perspective interp */
2413			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
2414
2415		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
2416			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
2417
2418		for (n = 0; n < pc->attr_nr; ++n) {
2419			p->cfg.io[n].hw = rid = aid;
2420			i = p->cfg.io[n].id_fp;
2421
2422			for (c = 0; c < 4; ++c) {
2423				if (!pc->attr[i * 4 + c].acc)
2424					continue;
2425				pc->attr[i * 4 + c].rhw = rid++;
2426				p->cfg.io[n].mask |= 1 << c;
2427
2428				load_interpolant(pc, &pc->attr[i * 4 + c]);
2429			}
2430			aid += popcnt4(p->cfg.io[n].mask);
2431		}
2432
2433		if (!base)
2434			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
2435
2436		m = popcnt4(p->cfg.regs[1] >> 24);
2437
2438		/* set count of non-position inputs and of non-flat
2439		 * non-position inputs for FP_INTERPOLANT_CTRL
2440		 */
2441		p->cfg.regs[1] |= aid - m;
2442
2443		if (flat_nr) {
2444			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
2445			p->cfg.regs[1] |= (i - m) << 16;
2446		} else
2447			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
2448
2449		/* mark color semantic for light-twoside */
2450		n = 0x40;
2451		for (i = 0; i < pc->attr_nr; i++) {
2452			ubyte si, sn;
2453
2454			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
2455			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
2456
2457			if (sn == TGSI_SEMANTIC_COLOR) {
2458				p->cfg.two_side[si] = p->cfg.io[i];
2459
2460				/* increase colour count */
2461				p->cfg.regs[0] += popcnt4(
2462					p->cfg.two_side[si].mask) << 16;
2463
2464				n = MIN2(n, p->cfg.io[i].hw - m);
2465			}
2466		}
2467		if (n < 0x40)
2468			p->cfg.regs[0] += n;
2469
2470		/* Initialize FP results:
2471		 * FragDepth is always first TGSI and last hw output
2472		 */
2473		i = p->info.writes_z ? 4 : 0;
2474		for (rid = 0; i < pc->result_nr * 4; i++)
2475			pc->result[i].rhw = rid++;
2476		if (p->info.writes_z)
2477			pc->result[2].rhw = rid;
2478
2479		p->cfg.high_result = rid;
2480	}
2481
2482	if (pc->immd_nr) {
2483		int rid = 0;
2484
2485		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
2486		if (!pc->immd)
2487			goto out_err;
2488
2489		for (i = 0; i < pc->immd_nr; i++) {
2490			for (c = 0; c < 4; c++, rid++)
2491				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
2492		}
2493	}
2494
2495	ret = TRUE;
2496out_err:
2497	if (pc->iv_p)
2498		free_temp(pc, pc->iv_p);
2499	if (pc->iv_c)
2500		free_temp(pc, pc->iv_c);
2501
2502	tgsi_parse_free(&tp);
2503	return ret;
2504}
2505
2506static void
2507free_nv50_pc(struct nv50_pc *pc)
2508{
2509	if (pc->immd)
2510		FREE(pc->immd);
2511	if (pc->param)
2512		FREE(pc->param);
2513	if (pc->result)
2514		FREE(pc->result);
2515	if (pc->attr)
2516		FREE(pc->attr);
2517	if (pc->temp)
2518		FREE(pc->temp);
2519
2520	FREE(pc);
2521}
2522
2523static boolean
2524ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
2525{
2526	int i, c;
2527	unsigned rtype[2] = { P_ATTR, P_RESULT };
2528
2529	pc->p = p;
2530	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
2531	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
2532	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
2533	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
2534
2535	p->cfg.high_temp = 4;
2536
2537	p->cfg.two_side[0].hw = 0x40;
2538	p->cfg.two_side[1].hw = 0x40;
2539
2540	switch (p->type) {
2541	case PIPE_SHADER_VERTEX:
2542		p->cfg.psiz = 0x40;
2543		p->cfg.clpd = 0x40;
2544		p->cfg.io_nr = pc->result_nr;
2545		break;
2546	case PIPE_SHADER_FRAGMENT:
2547		rtype[0] = rtype[1] = P_TEMP;
2548
2549		p->cfg.regs[0] = 0x01000004;
2550		p->cfg.io_nr = pc->attr_nr;
2551
2552		if (p->info.writes_z) {
2553			p->cfg.regs[2] |= 0x00000100;
2554			p->cfg.regs[3] |= 0x00000011;
2555		}
2556		if (p->info.uses_kill)
2557			p->cfg.regs[2] |= 0x00100000;
2558		break;
2559	}
2560
2561	if (pc->temp_nr) {
2562		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
2563		if (!pc->temp)
2564			return FALSE;
2565
2566		for (i = 0; i < pc->temp_nr * 4; ++i)
2567			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
2568	}
2569
2570	if (pc->attr_nr) {
2571		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
2572		if (!pc->attr)
2573			return FALSE;
2574
2575		for (i = 0; i < pc->attr_nr * 4; ++i)
2576			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
2577	}
2578
2579	if (pc->result_nr) {
2580		unsigned nr = pc->result_nr * 4;
2581
2582		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
2583		if (!pc->result)
2584			return FALSE;
2585
2586		for (i = 0; i < nr; ++i)
2587			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
2588	}
2589
2590	if (pc->param_nr) {
2591		int rid = 0;
2592
2593		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
2594		if (!pc->param)
2595			return FALSE;
2596
2597		for (i = 0; i < pc->param_nr; ++i)
2598			for (c = 0; c < 4; ++c, ++rid)
2599				ctor_reg(&pc->param[rid], P_CONST, i, rid);
2600	}
2601
2602	return TRUE;
2603}
2604
2605static void
2606nv50_fp_move_results(struct nv50_pc *pc)
2607{
2608	struct nv50_reg reg;
2609	unsigned i;
2610
2611	ctor_reg(&reg, P_TEMP, -1, -1);
2612
2613	for (i = 0; i < pc->result_nr * 4; ++i) {
2614		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
2615			continue;
2616		if (pc->result[i].rhw != pc->result[i].hw) {
2617			reg.hw = pc->result[i].rhw;
2618			emit_mov(pc, &reg, &pc->result[i]);
2619		}
2620	}
2621}
2622
2623static void
2624nv50_program_fixup_insns(struct nv50_pc *pc)
2625{
2626	struct nv50_program_exec *e, *prev = NULL, **bra_list;
2627	unsigned i, n, pos;
2628
2629	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
2630
2631	/* Collect branch instructions, we need to adjust their offsets
2632	 * when converting 32 bit instructions to 64 bit ones
2633	 */
2634	for (n = 0, e = pc->p->exec_head; e; e = e->next)
2635		if (e->param.index >= 0 && !e->param.mask)
2636			bra_list[n++] = e;
2637
2638	/* Make sure we don't have any single 32 bit instructions. */
2639	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
2640		pos += is_long(e) ? 2 : 1;
2641
2642		if ((pos & 1) && (!e->next || is_long(e->next))) {
2643			for (i = 0; i < n; ++i)
2644				if (bra_list[i]->param.index >= pos)
2645					bra_list[i]->param.index += 1;
2646			convert_to_long(pc, e);
2647			++pos;
2648		}
2649		if (e->next)
2650			prev = e;
2651	}
2652
2653	assert(!is_immd(pc->p->exec_head));
2654	assert(!is_immd(pc->p->exec_tail));
2655
2656	/* last instruction must be long so it can have the end bit set */
2657	if (!is_long(pc->p->exec_tail)) {
2658		convert_to_long(pc, pc->p->exec_tail);
2659		if (prev)
2660			convert_to_long(pc, prev);
2661	}
2662	assert(!(pc->p->exec_tail->inst[1] & 2));
2663	/* set the end-bit */
2664	pc->p->exec_tail->inst[1] |= 1;
2665
2666	FREE(bra_list);
2667}
2668
2669static boolean
2670nv50_program_tx(struct nv50_program *p)
2671{
2672	struct tgsi_parse_context parse;
2673	struct nv50_pc *pc;
2674	boolean ret;
2675
2676	pc = CALLOC_STRUCT(nv50_pc);
2677	if (!pc)
2678		return FALSE;
2679
2680	ret = ctor_nv50_pc(pc, p);
2681	if (ret == FALSE)
2682		goto out_cleanup;
2683
2684	ret = nv50_program_tx_prep(pc);
2685	if (ret == FALSE)
2686		goto out_cleanup;
2687
2688	tgsi_parse_init(&parse, pc->p->pipe.tokens);
2689	while (!tgsi_parse_end_of_tokens(&parse)) {
2690		const union tgsi_full_token *tok = &parse.FullToken;
2691
2692		/* don't allow half insn/immd on first and last instruction */
2693		pc->allow32 = TRUE;
2694		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2695			pc->allow32 = FALSE;
2696
2697		tgsi_parse_token(&parse);
2698
2699		switch (tok->Token.Type) {
2700		case TGSI_TOKEN_TYPE_INSTRUCTION:
2701			++pc->insn_cur;
2702			ret = nv50_tgsi_insn(pc, tok);
2703			if (ret == FALSE)
2704				goto out_err;
2705			break;
2706		default:
2707			break;
2708		}
2709	}
2710
2711	if (pc->p->type == PIPE_SHADER_FRAGMENT)
2712		nv50_fp_move_results(pc);
2713
2714	nv50_program_fixup_insns(pc);
2715
2716	p->param_nr = pc->param_nr * 4;
2717	p->immd_nr = pc->immd_nr * 4;
2718	p->immd = pc->immd_buf;
2719
2720out_err:
2721	tgsi_parse_free(&parse);
2722
2723out_cleanup:
2724	free_nv50_pc(pc);
2725	return ret;
2726}
2727
2728static void
2729nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2730{
2731	if (nv50_program_tx(p) == FALSE)
2732		assert(0);
2733	p->translated = TRUE;
2734}
2735
2736static void
2737nv50_program_upload_data(struct nv50_context *nv50, float *map,
2738			unsigned start, unsigned count, unsigned cbuf)
2739{
2740	struct nouveau_channel *chan = nv50->screen->base.channel;
2741	struct nouveau_grobj *tesla = nv50->screen->tesla;
2742
2743	while (count) {
2744		unsigned nr = count > 2047 ? 2047 : count;
2745
2746		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2747		OUT_RING  (chan, (cbuf << 0) | (start << 8));
2748		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2749		OUT_RINGp (chan, map, nr);
2750
2751		map += nr;
2752		start += nr;
2753		count -= nr;
2754	}
2755}
2756
2757static void
2758nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2759{
2760	struct pipe_screen *pscreen = nv50->pipe.screen;
2761
2762	if (!p->data[0] && p->immd_nr) {
2763		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2764
2765		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
2766			while (heap->next && heap->size < p->immd_nr) {
2767				struct nv50_program *evict = heap->next->priv;
2768				nouveau_resource_free(&evict->data[0]);
2769			}
2770
2771			if (nouveau_resource_alloc(heap, p->immd_nr, p,
2772						   &p->data[0]))
2773				assert(0);
2774		}
2775
2776		/* immediates only need to be uploaded again when freed */
2777		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2778					 p->immd_nr, NV50_CB_PMISC);
2779	}
2780
2781	assert(p->param_nr <= 128);
2782
2783	if (p->param_nr) {
2784		unsigned cb;
2785		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
2786					     PIPE_BUFFER_USAGE_CPU_READ);
2787
2788		if (p->type == PIPE_SHADER_VERTEX)
2789			cb = NV50_CB_PVP;
2790		else
2791			cb = NV50_CB_PFP;
2792
2793		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
2794		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
2795	}
2796}
2797
2798static void
2799nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2800{
2801	struct nouveau_channel *chan = nv50->screen->base.channel;
2802	struct nouveau_grobj *tesla = nv50->screen->tesla;
2803	struct nv50_program_exec *e;
2804	struct nouveau_stateobj *so;
2805	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2806	unsigned start, count, *up, *ptr;
2807	boolean upload = FALSE;
2808
2809	if (!p->bo) {
2810		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
2811			       p->exec_size * 4, &p->bo);
2812		upload = TRUE;
2813	}
2814
2815	if (p->data[0] && p->data[0]->start != p->data_start[0])
2816		upload = TRUE;
2817
2818	if (!upload)
2819		return;
2820
2821	for (e = p->exec_head; e; e = e->next) {
2822		unsigned ei, ci, bs;
2823
2824		if (e->param.index < 0)
2825			continue;
2826
2827		if (e->param.mask == 0) {
2828			assert(!(e->param.index & 1));
2829			/* seem to be 8 byte steps */
2830			ei = (e->param.index >> 1) + 0 /* START_ID */;
2831
2832			e->inst[0] &= 0xf0000fff;
2833			e->inst[0] |= ei << 12;
2834			continue;
2835		}
2836
2837		bs = (e->inst[1] >> 22) & 0x07;
2838		assert(bs < 2);
2839		ei = e->param.shift >> 5;
2840		ci = e->param.index;
2841		if (bs == 0)
2842			ci += p->data[bs]->start;
2843
2844		e->inst[ei] &= ~e->param.mask;
2845		e->inst[ei] |= (ci << e->param.shift);
2846	}
2847
2848	if (p->data[0])
2849		p->data_start[0] = p->data[0]->start;
2850
2851#ifdef NV50_PROGRAM_DUMP
2852	NOUVEAU_ERR("-------\n");
2853	for (e = p->exec_head; e; e = e->next) {
2854		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2855		if (is_long(e))
2856			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2857	}
2858#endif
2859
2860	up = ptr = MALLOC(p->exec_size * 4);
2861	for (e = p->exec_head; e; e = e->next) {
2862		*(ptr++) = e->inst[0];
2863		if (is_long(e))
2864			*(ptr++) = e->inst[1];
2865	}
2866
2867	so = so_new(4,2);
2868	so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
2869	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2870	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2871	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2872
2873	start = 0; count = p->exec_size;
2874	while (count) {
2875		struct nouveau_channel *chan = nv50->screen->base.channel;
2876		unsigned nr;
2877
2878		so_emit(chan, so);
2879
2880		nr = MIN2(count, 2047);
2881		nr = MIN2(chan->pushbuf->remaining, nr);
2882		if (chan->pushbuf->remaining < (nr + 3)) {
2883			FIRE_RING(chan);
2884			continue;
2885		}
2886
2887		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2888		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
2889		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2890		OUT_RINGp (chan, up + start, nr);
2891
2892		start += nr;
2893		count -= nr;
2894	}
2895
2896	FREE(up);
2897	so_ref(NULL, &so);
2898}
2899
2900void
2901nv50_vertprog_validate(struct nv50_context *nv50)
2902{
2903	struct nouveau_grobj *tesla = nv50->screen->tesla;
2904	struct nv50_program *p = nv50->vertprog;
2905	struct nouveau_stateobj *so;
2906
2907	if (!p->translated) {
2908		nv50_program_validate(nv50, p);
2909		if (!p->translated)
2910			assert(0);
2911	}
2912
2913	nv50_program_validate_data(nv50, p);
2914	nv50_program_validate_code(nv50, p);
2915
2916	so = so_new(13, 2);
2917	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2918	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2919		      NOUVEAU_BO_HIGH, 0, 0);
2920	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2921		      NOUVEAU_BO_LOW, 0, 0);
2922	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
2923	so_data  (so, p->cfg.attr[0]);
2924	so_data  (so, p->cfg.attr[1]);
2925	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
2926	so_data  (so, p->cfg.high_result);
2927	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
2928	so_data  (so, p->cfg.high_result); //8);
2929	so_data  (so, p->cfg.high_temp);
2930	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
2931	so_data  (so, 0); /* program start offset */
2932	so_ref(so, &nv50->state.vertprog);
2933	so_ref(NULL, &so);
2934}
2935
2936void
2937nv50_fragprog_validate(struct nv50_context *nv50)
2938{
2939	struct nouveau_grobj *tesla = nv50->screen->tesla;
2940	struct nv50_program *p = nv50->fragprog;
2941	struct nouveau_stateobj *so;
2942
2943	if (!p->translated) {
2944		nv50_program_validate(nv50, p);
2945		if (!p->translated)
2946			assert(0);
2947	}
2948
2949	nv50_program_validate_data(nv50, p);
2950	nv50_program_validate_code(nv50, p);
2951
2952	so = so_new(64, 2);
2953	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2954	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2955		      NOUVEAU_BO_HIGH, 0, 0);
2956	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2957		      NOUVEAU_BO_LOW, 0, 0);
2958	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
2959	so_data  (so, p->cfg.high_temp);
2960	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
2961	so_data  (so, p->cfg.high_result);
2962	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
2963	so_data  (so, p->cfg.regs[2]);
2964	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
2965	so_data  (so, p->cfg.regs[3]);
2966	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
2967	so_data  (so, 0); /* program start offset */
2968	so_ref(so, &nv50->state.fragprog);
2969	so_ref(NULL, &so);
2970}
2971
2972static void
2973nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
2974{
2975	struct nv50_program *fp = nv50->fragprog;
2976	struct nv50_program *vp = nv50->vertprog;
2977	unsigned i, c, m = base;
2978
2979	/* XXX: This can't work correctly in all cases yet, we either
2980	 * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has
2981	 * to be per FP input instead of per VP output
2982	 */
2983	memset(pntc, 0, 8 * sizeof(uint32_t));
2984
2985	for (i = 0; i < fp->cfg.io_nr; i++) {
2986		uint8_t sn, si;
2987		uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
2988		unsigned n = popcnt4(fp->cfg.io[i].mask);
2989
2990		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
2991			m += n;
2992			continue;
2993		}
2994
2995		sn = vp->info.input_semantic_name[j];
2996		si = vp->info.input_semantic_index[j];
2997
2998		if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) {
2999			ubyte mode =
3000				nv50->rasterizer->pipe.sprite_coord_mode[si];
3001
3002			if (mode == PIPE_SPRITE_COORD_NONE) {
3003				m += n;
3004				continue;
3005			}
3006		}
3007
3008		/* this is either PointCoord or replaced by sprite coords */
3009		for (c = 0; c < 4; c++) {
3010			if (!(fp->cfg.io[i].mask & (1 << c)))
3011				continue;
3012			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
3013			++m;
3014		}
3015	}
3016}
3017
3018static int
3019nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
3020	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
3021{
3022	int c;
3023	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
3024	uint8_t *map = (uint8_t *)p_map;
3025
3026	for (c = 0; c < 4; ++c) {
3027		if (mf & 1) {
3028			if (fpi->linear == TRUE)
3029				lin[mid / 32] |= 1 << (mid % 32);
3030			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
3031		}
3032
3033		oid += mv & 1;
3034		mf >>= 1;
3035		mv >>= 1;
3036	}
3037
3038	return mid;
3039}
3040
3041void
3042nv50_linkage_validate(struct nv50_context *nv50)
3043{
3044	struct nouveau_grobj *tesla = nv50->screen->tesla;
3045	struct nv50_program *vp = nv50->vertprog;
3046	struct nv50_program *fp = nv50->fragprog;
3047	struct nouveau_stateobj *so;
3048	struct nv50_sreg4 dummy, *vpo;
3049	int i, n, c, m = 0;
3050	uint32_t map[16], lin[4], reg[5], pcrd[8];
3051
3052	memset(map, 0, sizeof(map));
3053	memset(lin, 0, sizeof(lin));
3054
3055	reg[1] = 0x00000004; /* low and high clip distance map ids */
3056	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
3057	reg[3] = 0x00000000; /* point size map id & enable */
3058	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
3059	reg[4] = fp->cfg.regs[1]; /* interpolant info */
3060
3061	dummy.linear = FALSE;
3062	dummy.mask = 0xf; /* map all components of HPOS */
3063	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
3064
3065	dummy.mask = 0x0;
3066
3067	if (vp->cfg.clpd < 0x40) {
3068		for (c = 0; c < vp->cfg.clpd_nr; ++c)
3069			map[m++] = vp->cfg.clpd + c;
3070		reg[1] = (m << 8);
3071	}
3072
3073	reg[0] |= m << 8; /* adjust BFC0 id */
3074
3075	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
3076	if (nv50->rasterizer->pipe.light_twoside) {
3077		vpo = &vp->cfg.two_side[0];
3078
3079		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
3080		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
3081	}
3082
3083	reg[0] += m - 4; /* adjust FFC0 id */
3084	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
3085
3086	i = 0;
3087	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
3088		i = 1;
3089	for (; i < fp->cfg.io_nr; i++) {
3090		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
3091		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
3092
3093		n = fp->cfg.io[i].id_vp;
3094		if (n >= vp->cfg.io_nr ||
3095		    vp->info.output_semantic_name[n] != sn ||
3096		    vp->info.output_semantic_index[n] != si)
3097			vpo = &dummy;
3098		else
3099			vpo = &vp->cfg.io[n];
3100
3101		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
3102	}
3103
3104	if (nv50->rasterizer->pipe.point_size_per_vertex) {
3105		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
3106		reg[3] = (m++ << 4) | 1;
3107	}
3108
3109	/* now fill the stateobj */
3110	so = so_new(64, 0);
3111
3112	n = (m + 3) / 4;
3113	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
3114	so_data  (so, m);
3115	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
3116	so_datap (so, map, n);
3117
3118	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
3119	so_datap (so, reg, 4);
3120
3121	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
3122	so_data  (so, reg[4]);
3123
3124	so_method(so, tesla, 0x1540, 4);
3125	so_datap (so, lin, 4);
3126
3127	if (nv50->rasterizer->pipe.point_sprite) {
3128		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
3129
3130		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
3131		so_datap (so, pcrd, 8);
3132	}
3133
3134        so_ref(so, &nv50->state.programs);
3135        so_ref(NULL, &so);
3136}
3137
3138void
3139nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
3140{
3141	while (p->exec_head) {
3142		struct nv50_program_exec *e = p->exec_head;
3143
3144		p->exec_head = e->next;
3145		FREE(e);
3146	}
3147	p->exec_tail = NULL;
3148	p->exec_size = 0;
3149
3150	nouveau_bo_ref(NULL, &p->bo);
3151
3152	nouveau_resource_free(&p->data[0]);
3153
3154	p->translated = 0;
3155}
3156