nv50_program.c revision c738c9ab67859f3d4412417333d0f023dd18dc19
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
34#define NV50_SU_MAX_TEMP 64
35#define NV50_SU_MAX_ADDR 4
36//#define NV50_PROGRAM_DUMP
37
38/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */
39
/* ARL - gallium fails on progs/vp/arl.txt
 *
 * MSB - Like MAD, but MUL+SUB
 * 	- Drop it as a separate op; introduce a way to negate args for ops
 * 	  that support it.
 *
 * Look into inlining IMMD for ops other than MOV (make it general?)
 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
 *
 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
 * case, if the emit_src() causes the inst to suddenly become long.
 *
 * Verify half-insns work where expected - and force disable them where they
 * don't work - MUL has it forcibly disabled atm as it fixes POW..
 *
 * WARNING: watch dst==src vectors, can overwrite components that are needed.
 * 	ie. SUB R0, R0.yzxw, R0
58 *
59 * Things to check with renouveau:
60 * 	FP attr/result assignment - how?
61 * 		attrib
62 * 			- 0x16bc maps vp output onto fp hpos
63 * 			- 0x16c0 maps vp output onto fp col0
64 * 		result
65 * 			- colr always 0-3
66 * 			- depr always 4
67 * 0x16bc->0x16e8 --> some binding between vp/fp regs
68 * 0x16b8 --> VP output count
69 *
70 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
71 * 	      "MOV rcol.x, fcol.y" = 0x00000004
72 * 0x19a8 --> as above but 0x00000100 and 0x00000000
73 * 	- 0x00100000 used when KIL used
74 * 0x196c --> as above but 0x00000011 and 0x00000000
75 *
76 * 0x1988 --> 0xXXNNNNNN
77 * 	- XX == FP high something
78 */
/* One shader register as tracked by the code generator. */
struct nv50_reg {
	/* which register file the entry lives in */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD,
		P_ADDR
	} type;
	/* register index; -1 for regs created by alloc_temp()/alloc_immd() */
	int index;

	/* hardware slot; -1 while a P_TEMP has no slot assigned yet */
	int hw;
	/* non-zero when the operand is to be used negated */
	int neg;

	/* result hw for FP outputs, or interpolant index; -1 if unset */
	int rhw;
	/* instruction where this reg is last read (first insn == 1) */
	int acc;
};
96
97/* arbitrary limits */
98#define MAX_IF_DEPTH 4
99#define MAX_LOOP_DEPTH 4
100
101struct nv50_pc {
102	struct nv50_program *p;
103
104	/* hw resources */
105	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
106	struct nv50_reg r_addr[NV50_SU_MAX_ADDR];
107
108	/* tgsi resources */
109	struct nv50_reg *temp;
110	int temp_nr;
111	struct nv50_reg *attr;
112	int attr_nr;
113	struct nv50_reg *result;
114	int result_nr;
115	struct nv50_reg *param;
116	int param_nr;
117	struct nv50_reg *immd;
118	float *immd_buf;
119	int immd_nr;
120	struct nv50_reg **addr;
121	int addr_nr;
122
123	struct nv50_reg *temp_temp[16];
124	unsigned temp_temp_nr;
125
126	/* broadcast and destination replacement regs */
127	struct nv50_reg *r_brdc;
128	struct nv50_reg *r_dst[4];
129
130	unsigned interp_mode[32];
131	/* perspective interpolation registers */
132	struct nv50_reg *iv_p;
133	struct nv50_reg *iv_c;
134
135	struct nv50_program_exec *if_cond;
136	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
137	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
138	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
139	int if_lvl, loop_lvl;
140	unsigned loop_pos[MAX_LOOP_DEPTH];
141
142	/* current instruction and total number of insns */
143	unsigned insn_cur;
144	unsigned insn_nr;
145
146	boolean allow32;
147};
148
149static INLINE void
150ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
151{
152	reg->type = type;
153	reg->index = index;
154	reg->hw = hw;
155	reg->neg = 0;
156	reg->rhw = -1;
157	reg->acc = 0;
158}
159
/* Return the number of set bits in the low four bits of val;
 * all higher bits are ignored.
 */
static INLINE unsigned
popcnt4(uint32_t val)
{
	unsigned nibble = val & 0xf;
	unsigned bits = 0;

	while (nibble) {
		bits += nibble & 1;
		nibble >>= 1;
	}
	return bits;
}
167
168static void
169terminate_mbb(struct nv50_pc *pc)
170{
171	int i;
172
173	/* remove records of temporary address register values */
174	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
175		if (pc->r_addr[i].index < 0)
176			pc->r_addr[i].rhw = -1;
177}
178
179static void
180alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
181{
182	int i = 0;
183
184	if (reg->type == P_RESULT) {
185		if (pc->p->cfg.high_result < (reg->hw + 1))
186			pc->p->cfg.high_result = reg->hw + 1;
187	}
188
189	if (reg->type != P_TEMP)
190		return;
191
192	if (reg->hw >= 0) {
193		/*XXX: do this here too to catch FP temp-as-attr usage..
194		 *     not clean, but works */
195		if (pc->p->cfg.high_temp < (reg->hw + 1))
196			pc->p->cfg.high_temp = reg->hw + 1;
197		return;
198	}
199
200	if (reg->rhw != -1) {
201		/* try to allocate temporary with index rhw first */
202		if (!(pc->r_temp[reg->rhw])) {
203			pc->r_temp[reg->rhw] = reg;
204			reg->hw = reg->rhw;
205			if (pc->p->cfg.high_temp < (reg->rhw + 1))
206				pc->p->cfg.high_temp = reg->rhw + 1;
207			return;
208		}
209		/* make sure we don't get things like $r0 needs to go
210		 * in $r1 and $r1 in $r0
211		 */
212		i = pc->result_nr * 4;
213	}
214
215	for (; i < NV50_SU_MAX_TEMP; i++) {
216		if (!(pc->r_temp[i])) {
217			pc->r_temp[i] = reg;
218			reg->hw = i;
219			if (pc->p->cfg.high_temp < (i + 1))
220				pc->p->cfg.high_temp = i + 1;
221			return;
222		}
223	}
224
225	assert(0);
226}
227
228/* XXX: For shaders that aren't executed linearly (e.g. shaders that
229 * contain loops), we need to assign all hw regs to TGSI TEMPs early,
230 * lest we risk temp_temps overwriting regs alloc'd "later".
231 */
232static struct nv50_reg *
233alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
234{
235	struct nv50_reg *r;
236	int i;
237
238	if (dst && dst->type == P_TEMP && dst->hw == -1)
239		return dst;
240
241	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
242		if (!pc->r_temp[i]) {
243			r = MALLOC_STRUCT(nv50_reg);
244			ctor_reg(r, P_TEMP, -1, i);
245			pc->r_temp[i] = r;
246			return r;
247		}
248	}
249
250	assert(0);
251	return NULL;
252}
253
254/* Assign the hw of the discarded temporary register src
255 * to the tgsi register dst and free src.
256 */
257static void
258assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
259{
260	assert(src->index == -1 && src->hw != -1);
261
262	if (dst->hw != -1)
263		pc->r_temp[dst->hw] = NULL;
264	pc->r_temp[src->hw] = dst;
265	dst->hw = src->hw;
266
267	FREE(src);
268}
269
270/* release the hardware resource held by r */
271static void
272release_hw(struct nv50_pc *pc, struct nv50_reg *r)
273{
274	assert(r->type == P_TEMP);
275	if (r->hw == -1)
276		return;
277
278	assert(pc->r_temp[r->hw] == r);
279	pc->r_temp[r->hw] = NULL;
280
281	r->acc = 0;
282	if (r->index == -1)
283		FREE(r);
284}
285
286static void
287free_temp(struct nv50_pc *pc, struct nv50_reg *r)
288{
289	if (r->index == -1) {
290		unsigned hw = r->hw;
291
292		FREE(pc->r_temp[hw]);
293		pc->r_temp[hw] = NULL;
294	}
295}
296
297static int
298alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
299{
300	int i;
301
302	if ((idx + 4) >= NV50_SU_MAX_TEMP)
303		return 1;
304
305	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
306	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
307		return alloc_temp4(pc, dst, idx + 4);
308
309	for (i = 0; i < 4; i++) {
310		dst[i] = MALLOC_STRUCT(nv50_reg);
311		ctor_reg(dst[i], P_TEMP, -1, idx + i);
312		pc->r_temp[idx + i] = dst[i];
313	}
314
315	return 0;
316}
317
/* Release the four temporaries in reg[0..3] via free_temp(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	struct nv50_reg **cur = reg;
	struct nv50_reg **const end = reg + 4;

	while (cur != end)
		free_temp(pc, *cur++);
}
326
327static struct nv50_reg *
328temp_temp(struct nv50_pc *pc)
329{
330	if (pc->temp_temp_nr >= 16)
331		assert(0);
332
333	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
334	return pc->temp_temp[pc->temp_temp_nr++];
335}
336
337static void
338kill_temp_temp(struct nv50_pc *pc)
339{
340	int i;
341
342	for (i = 0; i < pc->temp_temp_nr; i++)
343		free_temp(pc, pc->temp_temp[i]);
344	pc->temp_temp_nr = 0;
345}
346
347static int
348ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
349{
350	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
351			       (pc->immd_nr + 1) * 4 * sizeof(float));
352	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
353	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
354	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
355	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
356
357	return pc->immd_nr++;
358}
359
360static struct nv50_reg *
361alloc_immd(struct nv50_pc *pc, float f)
362{
363	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
364	unsigned hw;
365
366	for (hw = 0; hw < pc->immd_nr * 4; hw++)
367		if (pc->immd_buf[hw] == f)
368			break;
369
370	if (hw == pc->immd_nr * 4)
371		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
372
373	ctor_reg(r, P_IMMD, -1, hw);
374	return r;
375}
376
377static struct nv50_program_exec *
378exec(struct nv50_pc *pc)
379{
380	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
381
382	e->param.index = -1;
383	return e;
384}
385
386static void
387emit(struct nv50_pc *pc, struct nv50_program_exec *e)
388{
389	struct nv50_program *p = pc->p;
390
391	if (p->exec_tail)
392		p->exec_tail->next = e;
393	if (!p->exec_head)
394		p->exec_head = e;
395	p->exec_tail = e;
396	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
397}
398
399static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
400
401static boolean
402is_long(struct nv50_program_exec *e)
403{
404	if (e->inst[0] & 1)
405		return TRUE;
406	return FALSE;
407}
408
409static boolean
410is_immd(struct nv50_program_exec *e)
411{
412	if (is_long(e) && (e->inst[1] & 3) == 3)
413		return TRUE;
414	return FALSE;
415}
416
417static INLINE void
418set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
419	 struct nv50_program_exec *e)
420{
421	set_long(pc, e);
422	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
423	e->inst[1] |= (pred << 7) | (idx << 12);
424}
425
426static INLINE void
427set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
428	    struct nv50_program_exec *e)
429{
430	set_long(pc, e);
431	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
432	e->inst[1] |= (idx << 4) | (on << 6);
433}
434
435static INLINE void
436set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
437{
438	if (is_long(e))
439		return;
440
441	e->inst[0] |= 1;
442	set_pred(pc, 0xf, 0, e);
443	set_pred_wr(pc, 0, 0, e);
444}
445
446static INLINE void
447set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
448{
449	if (dst->type == P_RESULT) {
450		set_long(pc, e);
451		e->inst[1] |= 0x00000008;
452	}
453
454	alloc_reg(pc, dst);
455	e->inst[0] |= (dst->hw << 2);
456}
457
458static INLINE void
459set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
460{
461	float f = pc->immd_buf[imm->hw];
462	unsigned val = fui(imm->neg ? -f : f);
463
464	set_long(pc, e);
465	/*XXX: can't be predicated - bits overlap.. catch cases where both
466	 *     are required and avoid them. */
467	set_pred(pc, 0, 0, e);
468	set_pred_wr(pc, 0, 0, e);
469
470	e->inst[1] |= 0x00000002 | 0x00000001;
471	e->inst[0] |= (val & 0x3f) << 16;
472	e->inst[1] |= (val >> 6) << 2;
473}
474
475static INLINE void
476set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
477{
478	assert(!(e->inst[0] & 0x0c000000));
479	assert(!(e->inst[1] & 0x00000004));
480
481	e->inst[0] |= (a->hw & 3) << 26;
482	e->inst[1] |= (a->hw >> 2) << 2;
483}
484
485static void
486emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
487		  struct nv50_reg *src0, uint16_t src1_val)
488{
489	struct nv50_program_exec *e = exec(pc);
490
491	e->inst[0] = 0xd0000000 | (src1_val << 9);
492	e->inst[1] = 0x20000000;
493	set_long(pc, e);
494	e->inst[0] |= dst->hw << 2;
495	if (src0) /* otherwise will add to $a0, which is always 0 */
496		set_addr(e, src0);
497
498	emit(pc, e);
499}
500
501static struct nv50_reg *
502alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
503{
504	int i;
505	struct nv50_reg *a_tgsi = NULL, *a = NULL;
506
507	if (!ref) {
508		/* allocate for TGSI address reg */
509		for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
510			if (pc->r_addr[i].index >= 0)
511				continue;
512			if (pc->r_addr[i].rhw >= 0 &&
513			    pc->r_addr[i].acc == pc->insn_cur)
514				continue;
515
516			pc->r_addr[i].rhw = -1;
517			pc->r_addr[i].index = i;
518			return &pc->r_addr[i];
519		}
520		assert(0);
521		return NULL;
522	}
523
524	/* Allocate and set an address reg so we can access 'ref'.
525	 *
526	 * If and r_addr has index < 0, it is not reserved for TGSI,
527	 * and index will be the negative of the TGSI addr index the
528	 * value in rhw is relative to, or -256 if rhw is an offset
529	 * from 0. If rhw < 0, the reg has not been initialized.
530	 */
531	for (i = NV50_SU_MAX_ADDR - 1; i >= 0; --i) {
532		if (pc->r_addr[i].index >= 0) /* occupied for TGSI */
533			continue;
534		if (pc->r_addr[i].rhw < 0) { /* unused */
535			a = &pc->r_addr[i];
536			continue;
537		}
538		if (!a && pc->r_addr[i].acc != pc->insn_cur)
539			a = &pc->r_addr[i];
540
541		if (ref->hw - pc->r_addr[i].rhw >= 128)
542			continue;
543
544		if ((ref->acc >= 0 && pc->r_addr[i].index == -256) ||
545		    (ref->acc < 0 && -pc->r_addr[i].index == ref->index)) {
546			pc->r_addr[i].acc = pc->insn_cur;
547			return &pc->r_addr[i];
548		}
549	}
550	assert(a);
551
552	if (ref->acc < 0)
553		a_tgsi = pc->addr[ref->index];
554
555	emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4);
556
557	a->rhw = ref->hw & ~0x7f;
558	a->acc = pc->insn_cur;
559	a->index = a_tgsi ? -ref->index : -256;
560	return a;
561}
562
563#define INTERP_LINEAR		0
564#define INTERP_FLAT		1
565#define INTERP_PERSPECTIVE	2
566#define INTERP_CENTROID		4
567
568/* interpolant index has been stored in dst->rhw */
569static void
570emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
571		unsigned mode)
572{
573	assert(dst->rhw != -1);
574	struct nv50_program_exec *e = exec(pc);
575
576	e->inst[0] |= 0x80000000;
577	set_dst(pc, dst, e);
578	e->inst[0] |= (dst->rhw << 16);
579
580	if (mode & INTERP_FLAT) {
581		e->inst[0] |= (1 << 8);
582	} else {
583		if (mode & INTERP_PERSPECTIVE) {
584			e->inst[0] |= (1 << 25);
585			alloc_reg(pc, iv);
586			e->inst[0] |= (iv->hw << 9);
587		}
588
589		if (mode & INTERP_CENTROID)
590			e->inst[0] |= (1 << 24);
591	}
592
593	emit(pc, e);
594}
595
596static void
597set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
598	 struct nv50_program_exec *e)
599{
600	set_long(pc, e);
601
602	e->param.index = src->hw & 127;
603	e->param.shift = s;
604	e->param.mask = m << (s % 32);
605
606	if (src->hw > 127)
607		set_addr(e, alloc_addr(pc, src));
608	else
609	if (src->acc < 0) {
610		assert(src->type == P_CONST);
611		set_addr(e, pc->addr[src->index]);
612	}
613
614	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
615}
616
617static void
618emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
619{
620	struct nv50_program_exec *e = exec(pc);
621
622	e->inst[0] = 0x10000000;
623	if (!pc->allow32)
624		set_long(pc, e);
625
626	set_dst(pc, dst, e);
627
628	if (!is_long(e) && src->type == P_IMMD) {
629		set_immd(pc, src, e);
630		/*XXX: 32-bit, but steals part of "half" reg space - need to
631		 *     catch and handle this case if/when we do half-regs
632		 */
633	} else
634	if (src->type == P_IMMD || src->type == P_CONST) {
635		set_long(pc, e);
636		set_data(pc, src, 0x7f, 9, e);
637		e->inst[1] |= 0x20000000; /* src0 const? */
638	} else {
639		if (src->type == P_ATTR) {
640			set_long(pc, e);
641			e->inst[1] |= 0x00200000;
642		}
643
644		alloc_reg(pc, src);
645		e->inst[0] |= (src->hw << 9);
646	}
647
648	if (is_long(e) && !is_immd(e)) {
649		e->inst[1] |= 0x04000000; /* 32-bit */
650		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
651		if (!(e->inst[1] & 0x20000000))
652			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
653	} else
654		e->inst[0] |= 0x00008000;
655
656	emit(pc, e);
657}
658
659static INLINE void
660emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
661{
662	struct nv50_reg *imm = alloc_immd(pc, f);
663	emit_mov(pc, dst, imm);
664	FREE(imm);
665}
666
667static boolean
668check_swap_src_0_1(struct nv50_pc *pc,
669		   struct nv50_reg **s0, struct nv50_reg **s1)
670{
671	struct nv50_reg *src0 = *s0, *src1 = *s1;
672
673	if (src0->type == P_CONST) {
674		if (src1->type != P_CONST) {
675			*s0 = src1;
676			*s1 = src0;
677			return TRUE;
678		}
679	} else
680	if (src1->type == P_ATTR) {
681		if (src0->type != P_ATTR) {
682			*s0 = src1;
683			*s1 = src0;
684			return TRUE;
685		}
686	}
687
688	return FALSE;
689}
690
691static void
692set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
693		     struct nv50_program_exec *e)
694{
695	struct nv50_reg *temp;
696
697	if (src->type != P_TEMP) {
698		temp = temp_temp(pc);
699		emit_mov(pc, temp, src);
700		src = temp;
701	}
702
703	alloc_reg(pc, src);
704	e->inst[0] |= (src->hw << 9);
705}
706
707static void
708set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
709{
710	if (src->type == P_ATTR) {
711		set_long(pc, e);
712		e->inst[1] |= 0x00200000;
713	} else
714	if (src->type == P_CONST || src->type == P_IMMD) {
715		struct nv50_reg *temp = temp_temp(pc);
716
717		emit_mov(pc, temp, src);
718		src = temp;
719	}
720
721	alloc_reg(pc, src);
722	e->inst[0] |= (src->hw << 9);
723}
724
725static void
726set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
727{
728	if (src->type == P_ATTR) {
729		struct nv50_reg *temp = temp_temp(pc);
730
731		emit_mov(pc, temp, src);
732		src = temp;
733	} else
734	if (src->type == P_CONST || src->type == P_IMMD) {
735		assert(!(e->inst[0] & 0x00800000));
736		if (e->inst[0] & 0x01000000) {
737			struct nv50_reg *temp = temp_temp(pc);
738
739			emit_mov(pc, temp, src);
740			src = temp;
741		} else {
742			set_data(pc, src, 0x7f, 16, e);
743			e->inst[0] |= 0x00800000;
744		}
745	}
746
747	alloc_reg(pc, src);
748	e->inst[0] |= ((src->hw & 127) << 16);
749}
750
751static void
752set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
753{
754	set_long(pc, e);
755
756	if (src->type == P_ATTR) {
757		struct nv50_reg *temp = temp_temp(pc);
758
759		emit_mov(pc, temp, src);
760		src = temp;
761	} else
762	if (src->type == P_CONST || src->type == P_IMMD) {
763		assert(!(e->inst[0] & 0x01000000));
764		if (e->inst[0] & 0x00800000) {
765			struct nv50_reg *temp = temp_temp(pc);
766
767			emit_mov(pc, temp, src);
768			src = temp;
769		} else {
770			set_data(pc, src, 0x7f, 32+14, e);
771			e->inst[0] |= 0x01000000;
772		}
773	}
774
775	alloc_reg(pc, src);
776	e->inst[1] |= ((src->hw & 127) << 14);
777}
778
779static void
780emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
781	 struct nv50_reg *src1)
782{
783	struct nv50_program_exec *e = exec(pc);
784
785	e->inst[0] |= 0xc0000000;
786
787	if (!pc->allow32)
788		set_long(pc, e);
789
790	check_swap_src_0_1(pc, &src0, &src1);
791	set_dst(pc, dst, e);
792	set_src_0(pc, src0, e);
793	if (src1->type == P_IMMD && !is_long(e)) {
794		if (src0->neg)
795			e->inst[0] |= 0x00008000;
796		set_immd(pc, src1, e);
797	} else {
798		set_src_1(pc, src1, e);
799		if (src0->neg ^ src1->neg) {
800			if (is_long(e))
801				e->inst[1] |= 0x08000000;
802			else
803				e->inst[0] |= 0x00008000;
804		}
805	}
806
807	emit(pc, e);
808}
809
810static void
811emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
812	 struct nv50_reg *src0, struct nv50_reg *src1)
813{
814	struct nv50_program_exec *e = exec(pc);
815
816	e->inst[0] |= 0xb0000000;
817
818	check_swap_src_0_1(pc, &src0, &src1);
819
820	if (!pc->allow32 || src0->neg || src1->neg) {
821		set_long(pc, e);
822		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
823	}
824
825	set_dst(pc, dst, e);
826	set_src_0(pc, src0, e);
827	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
828		set_src_2(pc, src1, e);
829	else
830	if (src1->type == P_IMMD)
831		set_immd(pc, src1, e);
832	else
833		set_src_1(pc, src1, e);
834
835	emit(pc, e);
836}
837
838static void
839emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
840	 uint8_t s)
841{
842	struct nv50_program_exec *e = exec(pc);
843
844	set_long(pc, e);
845	e->inst[1] |= 0xc0000000;
846
847	e->inst[0] |= dst->hw << 2;
848	e->inst[0] |= s << 16; /* shift left */
849	set_src_0_restricted(pc, src, e);
850
851	emit(pc, e);
852}
853
854static void
855emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
856	    struct nv50_reg *src0, struct nv50_reg *src1)
857{
858	struct nv50_program_exec *e = exec(pc);
859
860	set_long(pc, e);
861	e->inst[0] |= 0xb0000000;
862	e->inst[1] |= (sub << 29);
863
864	check_swap_src_0_1(pc, &src0, &src1);
865	set_dst(pc, dst, e);
866	set_src_0(pc, src0, e);
867	set_src_1(pc, src1, e);
868
869	emit(pc, e);
870}
871
872static INLINE void
873emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
874	 struct nv50_reg *src1)
875{
876	src1->neg ^= 1;
877	emit_add(pc, dst, src0, src1);
878	src1->neg ^= 1;
879}
880
881static void
882emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
883	 struct nv50_reg *src1, struct nv50_reg *src2)
884{
885	struct nv50_program_exec *e = exec(pc);
886
887	e->inst[0] |= 0xe0000000;
888
889	check_swap_src_0_1(pc, &src0, &src1);
890	set_dst(pc, dst, e);
891	set_src_0(pc, src0, e);
892	set_src_1(pc, src1, e);
893	set_src_2(pc, src2, e);
894
895	if (src0->neg ^ src1->neg)
896		e->inst[1] |= 0x04000000;
897	if (src2->neg)
898		e->inst[1] |= 0x08000000;
899
900	emit(pc, e);
901}
902
903static INLINE void
904emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
905	 struct nv50_reg *src1, struct nv50_reg *src2)
906{
907	src2->neg ^= 1;
908	emit_mad(pc, dst, src0, src1, src2);
909	src2->neg ^= 1;
910}
911
912static void
913emit_flop(struct nv50_pc *pc, unsigned sub,
914	  struct nv50_reg *dst, struct nv50_reg *src)
915{
916	struct nv50_program_exec *e = exec(pc);
917
918	e->inst[0] |= 0x90000000;
919	if (sub) {
920		set_long(pc, e);
921		e->inst[1] |= (sub << 29);
922	}
923
924	set_dst(pc, dst, e);
925
926	if (sub == 0 || sub == 2)
927		set_src_0_restricted(pc, src, e);
928	else
929		set_src_0(pc, src, e);
930
931	emit(pc, e);
932}
933
934static void
935emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
936{
937	struct nv50_program_exec *e = exec(pc);
938
939	e->inst[0] |= 0xb0000000;
940
941	set_dst(pc, dst, e);
942	set_src_0(pc, src, e);
943	set_long(pc, e);
944	e->inst[1] |= (6 << 29) | 0x00004000;
945
946	emit(pc, e);
947}
948
949static void
950emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
951{
952	struct nv50_program_exec *e = exec(pc);
953
954	e->inst[0] |= 0xb0000000;
955
956	set_dst(pc, dst, e);
957	set_src_0(pc, src, e);
958	set_long(pc, e);
959	e->inst[1] |= (6 << 29);
960
961	emit(pc, e);
962}
963
964#define CVTOP_RN	0x01
965#define CVTOP_FLOOR	0x03
966#define CVTOP_CEIL	0x05
967#define CVTOP_TRUNC	0x07
968#define CVTOP_SAT	0x08
969#define CVTOP_ABS	0x10
970
971/* 0x04 == 32 bit dst */
972/* 0x40 == dst is float */
973/* 0x80 == src is float */
974#define CVT_F32_F32 0xc4
975#define CVT_F32_S32 0x44
976#define CVT_F32_U32 0x64
977#define CVT_S32_F32 0x8c
978#define CVT_S32_S32 0x0c
979#define CVT_NEG     0x20
980#define CVT_RI      0x08
981
982static void
983emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
984	 int wp, unsigned cvn, unsigned fmt)
985{
986	struct nv50_program_exec *e;
987
988	e = exec(pc);
989	set_long(pc, e);
990
991	e->inst[0] |= 0xa0000000;
992	e->inst[1] |= 0x00004000; /* 32 bit src */
993	e->inst[1] |= (cvn << 16);
994	e->inst[1] |= (fmt << 24);
995	set_src_0(pc, src, e);
996
997	if (wp >= 0)
998		set_pred_wr(pc, 1, wp, e);
999
1000	if (dst)
1001		set_dst(pc, dst, e);
1002	else {
1003		e->inst[0] |= 0x000001fc;
1004		e->inst[1] |= 0x00000008;
1005	}
1006
1007	emit(pc, e);
1008}
1009
1010/* nv50 Condition codes:
1011 *  0x1 = LT
1012 *  0x2 = EQ
1013 *  0x3 = LE
1014 *  0x4 = GT
1015 *  0x5 = NE
1016 *  0x6 = GE
1017 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
1018 *  0x8 = unordered bit (allows NaN)
1019 */
1020static void
1021emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
1022	 struct nv50_reg *src0, struct nv50_reg *src1)
1023{
1024	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
1025
1026	struct nv50_program_exec *e = exec(pc);
1027	struct nv50_reg *rdst;
1028
1029	assert(ccode < 16);
1030	if (check_swap_src_0_1(pc, &src0, &src1))
1031		ccode = cc_swapped[ccode & 7] | (ccode & 8);
1032
1033	rdst = dst;
1034	if (dst && dst->type != P_TEMP)
1035		dst = alloc_temp(pc, NULL);
1036
1037	/* set.u32 */
1038	set_long(pc, e);
1039	e->inst[0] |= 0xb0000000;
1040	e->inst[1] |= 0x60000000 | (ccode << 14);
1041
1042	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
1043	 * that doesn't seem to match what the hw actually does
1044	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
1045	 */
1046
1047	if (wp >= 0)
1048		set_pred_wr(pc, 1, wp, e);
1049	if (dst)
1050		set_dst(pc, dst, e);
1051	else {
1052		e->inst[0] |= 0x000001fc;
1053		e->inst[1] |= 0x00000008;
1054	}
1055
1056	set_src_0(pc, src0, e);
1057	set_src_1(pc, src1, e);
1058
1059	emit(pc, e);
1060	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */
1061
1062	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
1063	if (rdst)
1064		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
1065	if (rdst && rdst != dst)
1066		free_temp(pc, dst);
1067}
1068
1069static INLINE unsigned
1070map_tgsi_setop_cc(unsigned op)
1071{
1072	switch (op) {
1073	case TGSI_OPCODE_SLT: return 0x1;
1074	case TGSI_OPCODE_SGE: return 0x6;
1075	case TGSI_OPCODE_SEQ: return 0x2;
1076	case TGSI_OPCODE_SGT: return 0x4;
1077	case TGSI_OPCODE_SLE: return 0x3;
1078	case TGSI_OPCODE_SNE: return 0xd;
1079	default:
1080		assert(0);
1081		return 0;
1082	}
1083}
1084
1085static INLINE void
1086emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1087{
1088	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
1089}
1090
1091static void
1092emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
1093	 struct nv50_reg *v, struct nv50_reg *e)
1094{
1095	struct nv50_reg *temp = alloc_temp(pc, NULL);
1096
1097	emit_flop(pc, 3, temp, v);
1098	emit_mul(pc, temp, temp, e);
1099	emit_preex2(pc, temp, temp);
1100	emit_flop(pc, 6, dst, temp);
1101
1102	free_temp(pc, temp);
1103}
1104
1105static INLINE void
1106emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1107{
1108	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
1109}
1110
1111static INLINE void
1112emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1113{
1114	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
1115}
1116
1117static void
1118emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1119	 struct nv50_reg **src)
1120{
1121	struct nv50_reg *one = alloc_immd(pc, 1.0);
1122	struct nv50_reg *zero = alloc_immd(pc, 0.0);
1123	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
1124	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
1125	struct nv50_reg *tmp[4];
1126	boolean allow32 = pc->allow32;
1127
1128	pc->allow32 = FALSE;
1129
1130	if (mask & (3 << 1)) {
1131		tmp[0] = alloc_temp(pc, NULL);
1132		emit_minmax(pc, 4, tmp[0], src[0], zero);
1133	}
1134
1135	if (mask & (1 << 2)) {
1136		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
1137
1138		tmp[1] = temp_temp(pc);
1139		emit_minmax(pc, 4, tmp[1], src[1], zero);
1140
1141		tmp[3] = temp_temp(pc);
1142		emit_minmax(pc, 4, tmp[3], src[3], neg128);
1143		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
1144
1145		emit_pow(pc, dst[2], tmp[1], tmp[3]);
1146		emit_mov(pc, dst[2], zero);
1147		set_pred(pc, 3, 0, pc->p->exec_tail);
1148	}
1149
1150	if (mask & (1 << 1))
1151		assimilate_temp(pc, dst[1], tmp[0]);
1152	else
1153	if (mask & (1 << 2))
1154		free_temp(pc, tmp[0]);
1155
1156	pc->allow32 = allow32;
1157
1158	/* do this last, in case src[i,j] == dst[0,3] */
1159	if (mask & (1 << 0))
1160		emit_mov(pc, dst[0], one);
1161
1162	if (mask & (1 << 3))
1163		emit_mov(pc, dst[3], one);
1164
1165	FREE(pos128);
1166	FREE(neg128);
1167	FREE(zero);
1168	FREE(one);
1169}
1170
1171static INLINE void
1172emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1173{
1174	emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
1175}
1176
1177static void
1178emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
1179{
1180	struct nv50_program_exec *e;
1181	const int r_pred = 1;
1182	unsigned cvn = CVT_F32_F32;
1183
1184	if (src->neg)
1185		cvn |= CVT_NEG;
1186	/* write predicate reg */
1187	emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);
1188
1189	/* conditional discard */
1190	e = exec(pc);
1191	e->inst[0] = 0x00000002;
1192	set_long(pc, e);
1193	set_pred(pc, 0x1 /* LT */, r_pred, e);
1194	emit(pc, e);
1195}
1196
1197static void
1198emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1199	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
1200{
1201	struct nv50_reg *temp, *t[4];
1202	struct nv50_program_exec *e;
1203
1204	unsigned c, mode, dim;
1205
1206	switch (type) {
1207	case TGSI_TEXTURE_1D:
1208		dim = 1;
1209		break;
1210	case TGSI_TEXTURE_UNKNOWN:
1211	case TGSI_TEXTURE_2D:
1212	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1213	case TGSI_TEXTURE_RECT:
1214		dim = 2;
1215		break;
1216	case TGSI_TEXTURE_3D:
1217	case TGSI_TEXTURE_CUBE:
1218	case TGSI_TEXTURE_SHADOW2D:
1219	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1220		dim = 3;
1221		break;
1222	default:
1223		assert(0);
1224		break;
1225	}
1226
1227	/* some cards need t[0]'s hw index to be a multiple of 4 */
1228	alloc_temp4(pc, t, 0);
1229
1230	if (proj) {
1231		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1232			mode = pc->interp_mode[src[0]->index];
1233
1234			t[3]->rhw = src[3]->rhw;
1235			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1236			emit_flop(pc, 0, t[3], t[3]);
1237
1238			for (c = 0; c < dim; c++) {
1239				t[c]->rhw = src[c]->rhw;
1240				emit_interp(pc, t[c], t[3],
1241					    (mode | INTERP_PERSPECTIVE));
1242			}
1243		} else {
1244			emit_flop(pc, 0, t[3], src[3]);
1245			for (c = 0; c < dim; c++)
1246				emit_mul(pc, t[c], src[c], t[3]);
1247
1248			/* XXX: for some reason the blob sometimes uses MAD:
1249			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1250			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1251			 */
1252		}
1253	} else {
1254		if (type == TGSI_TEXTURE_CUBE) {
1255			temp = temp_temp(pc);
1256			emit_minmax(pc, 4, temp, src[0], src[1]);
1257			emit_minmax(pc, 4, temp, temp, src[2]);
1258			emit_flop(pc, 0, temp, temp);
1259			for (c = 0; c < 3; c++)
1260				emit_mul(pc, t[c], src[c], temp);
1261		} else {
1262			for (c = 0; c < dim; c++)
1263				emit_mov(pc, t[c], src[c]);
1264		}
1265	}
1266
1267	e = exec(pc);
1268	set_long(pc, e);
1269	e->inst[0] |= 0xf0000000;
1270	e->inst[1] |= 0x00000004;
1271	set_dst(pc, t[0], e);
1272	e->inst[0] |= (unit << 9);
1273
1274	if (dim == 2)
1275		e->inst[0] |= 0x00400000;
1276	else
1277	if (dim == 3)
1278		e->inst[0] |= 0x00800000;
1279
1280	e->inst[0] |= (mask & 0x3) << 25;
1281	e->inst[1] |= (mask & 0xc) << 12;
1282
1283	emit(pc, e);
1284
1285#if 1
1286	c = 0;
1287	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
1288	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
1289	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
1290	if (mask & 8) emit_mov(pc, dst[3], t[c]);
1291
1292	free_temp4(pc, t);
1293#else
1294	/* XXX: if p.e. MUL is used directly after TEX, it would still use
1295	 * the texture coordinates, not the fetched values: latency ? */
1296
1297	for (c = 0; c < 4; c++) {
1298		if (mask & (1 << c))
1299			assimilate_temp(pc, dst[c], t[c]);
1300		else
1301			free_temp(pc, t[c]);
1302	}
1303#endif
1304}
1305
1306static void
1307emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
1308	    struct nv50_program_exec **join)
1309{
1310	struct nv50_program_exec *e = exec(pc);
1311
1312	if (join) {
1313		set_long(pc, e);
1314		e->inst[0] |= 0xa0000002;
1315		emit(pc, e);
1316		*join = e;
1317		e = exec(pc);
1318	}
1319
1320	set_long(pc, e);
1321	e->inst[0] |= 0x10000002;
1322	if (pred >= 0)
1323		set_pred(pc, cc, pred, e);
1324	emit(pc, e);
1325}
1326
1327static void
1328emit_nop(struct nv50_pc *pc)
1329{
1330	struct nv50_program_exec *e = exec(pc);
1331
1332	e->inst[0] = 0xf0000000;
1333	set_long(pc, e);
1334	e->inst[1] = 0xe0000000;
1335	emit(pc, e);
1336}
1337
1338static void
1339emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1340{
1341	struct nv50_program_exec *e = exec(pc);
1342
1343	assert(src->type == P_TEMP);
1344
1345	e->inst[0] = 0xc0140000;
1346	e->inst[1] = 0x89800000;
1347	set_long(pc, e);
1348	set_dst(pc, dst, e);
1349	set_src_0(pc, src, e);
1350	set_src_2(pc, src, e);
1351
1352	emit(pc, e);
1353}
1354
1355static void
1356emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1357{
1358	struct nv50_program_exec *e = exec(pc);
1359
1360	assert(src->type == P_TEMP);
1361
1362	if (!src->neg) /* ! double negation */
1363		emit_neg(pc, src, src);
1364
1365	e->inst[0] = 0xc0150000;
1366	e->inst[1] = 0x8a400000;
1367	set_long(pc, e);
1368	set_dst(pc, dst, e);
1369	set_src_0(pc, src, e);
1370	set_src_2(pc, src, e);
1371
1372	emit(pc, e);
1373}
1374
1375static void
1376convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1377{
1378	unsigned q = 0, m = ~0;
1379
1380	assert(!is_long(e));
1381
1382	switch (e->inst[0] >> 28) {
1383	case 0x1:
1384		/* MOV */
1385		q = 0x0403c000;
1386		m = 0xffff7fff;
1387		break;
1388	case 0x8:
1389		/* INTERP (move centroid, perspective and flat bits) */
1390		m = ~0x03000100;
1391		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1392		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1393		break;
1394	case 0x9:
1395		/* RCP */
1396		break;
1397	case 0xB:
1398		/* ADD */
1399		m = ~(127 << 16);
1400		q = ((e->inst[0] & (~m)) >> 2);
1401		break;
1402	case 0xC:
1403		/* MUL */
1404		m = ~0x00008000;
1405		q = ((e->inst[0] & (~m)) << 12);
1406		break;
1407	case 0xE:
1408		/* MAD (if src2 == dst) */
1409		q = ((e->inst[0] & 0x1fc) << 12);
1410		break;
1411	default:
1412		assert(0);
1413		break;
1414	}
1415
1416	set_long(pc, e);
1417	pc->p->exec_size++;
1418
1419	e->inst[0] &= m;
1420	e->inst[1] |= q;
1421}
1422
1423/* Some operations support an optional negation flag. */
1424static boolean
1425negate_supported(const struct tgsi_full_instruction *insn, int i)
1426{
1427	int s;
1428
1429	switch (insn->Instruction.Opcode) {
1430	case TGSI_OPCODE_DDY:
1431	case TGSI_OPCODE_DP3:
1432	case TGSI_OPCODE_DP4:
1433	case TGSI_OPCODE_MUL:
1434	case TGSI_OPCODE_KIL:
1435	case TGSI_OPCODE_ADD:
1436	case TGSI_OPCODE_SUB:
1437	case TGSI_OPCODE_MAD:
1438		break;
1439	case TGSI_OPCODE_POW:
1440		if (i == 1)
1441			break;
1442		return FALSE;
1443	default:
1444		return FALSE;
1445	}
1446
1447	/* Watch out for possible multiple uses of an nv50_reg, we
1448	 * can't use nv50_reg::neg in these cases.
1449	 */
1450	for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) {
1451		if (s == i)
1452			continue;
1453		if ((insn->FullSrcRegisters[s].SrcRegister.Index ==
1454		     insn->FullSrcRegisters[i].SrcRegister.Index) &&
1455		    (insn->FullSrcRegisters[s].SrcRegister.File ==
1456		     insn->FullSrcRegisters[i].SrcRegister.File))
1457			return FALSE;
1458	}
1459
1460	return TRUE;
1461}
1462
1463/* Return a read mask for source registers deduced from opcode & write mask. */
1464static unsigned
1465nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
1466{
1467	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
1468
1469	switch (insn->Instruction.Opcode) {
1470	case TGSI_OPCODE_COS:
1471	case TGSI_OPCODE_SIN:
1472		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
1473	case TGSI_OPCODE_DP3:
1474		return 0x7;
1475	case TGSI_OPCODE_DP4:
1476	case TGSI_OPCODE_DPH:
1477	case TGSI_OPCODE_KIL: /* WriteMask ignored */
1478		return 0xf;
1479	case TGSI_OPCODE_DST:
1480		return mask & (c ? 0xa : 0x6);
1481	case TGSI_OPCODE_EX2:
1482	case TGSI_OPCODE_LG2:
1483	case TGSI_OPCODE_POW:
1484	case TGSI_OPCODE_RCP:
1485	case TGSI_OPCODE_RSQ:
1486	case TGSI_OPCODE_SCS:
1487		return 0x1;
1488	case TGSI_OPCODE_LIT:
1489		return 0xb;
1490	case TGSI_OPCODE_TEX:
1491	case TGSI_OPCODE_TXP:
1492	{
1493		const struct tgsi_instruction_ext_texture *tex;
1494
1495		assert(insn->Instruction.Extended);
1496		tex = &insn->InstructionExtTexture;
1497
1498		mask = 0x7;
1499		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1500			mask |= 0x8;
1501
1502		switch (tex->Texture) {
1503		case TGSI_TEXTURE_1D:
1504			mask &= 0x9;
1505			break;
1506		case TGSI_TEXTURE_2D:
1507			mask &= 0xb;
1508			break;
1509		default:
1510			break;
1511		}
1512	}
1513		return mask;
1514	case TGSI_OPCODE_XPD:
1515		x = 0;
1516		if (mask & 1) x |= 0x6;
1517		if (mask & 2) x |= 0x5;
1518		if (mask & 4) x |= 0x3;
1519		return x;
1520	default:
1521		break;
1522	}
1523
1524	return mask;
1525}
1526
1527static struct nv50_reg *
1528tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1529{
1530	switch (dst->DstRegister.File) {
1531	case TGSI_FILE_TEMPORARY:
1532		return &pc->temp[dst->DstRegister.Index * 4 + c];
1533	case TGSI_FILE_OUTPUT:
1534		return &pc->result[dst->DstRegister.Index * 4 + c];
1535	case TGSI_FILE_ADDRESS:
1536	{
1537		struct nv50_reg *r = pc->addr[dst->DstRegister.Index * 4 + c];
1538		if (!r) {
1539			r = alloc_addr(pc, NULL);
1540			pc->addr[dst->DstRegister.Index * 4 + c] = r;
1541		}
1542		assert(r);
1543		return r;
1544	}
1545	case TGSI_FILE_NULL:
1546		return NULL;
1547	default:
1548		break;
1549	}
1550
1551	return NULL;
1552}
1553
1554static struct nv50_reg *
1555tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1556	 boolean neg)
1557{
1558	struct nv50_reg *r = NULL;
1559	struct nv50_reg *temp;
1560	unsigned sgn, c, swz;
1561
1562	if (src->SrcRegister.File != TGSI_FILE_CONSTANT)
1563		assert(!src->SrcRegister.Indirect);
1564
1565	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1566
1567	c = tgsi_util_get_full_src_register_swizzle(src, chan);
1568	switch (c) {
1569	case TGSI_SWIZZLE_X:
1570	case TGSI_SWIZZLE_Y:
1571	case TGSI_SWIZZLE_Z:
1572	case TGSI_SWIZZLE_W:
1573		switch (src->SrcRegister.File) {
1574		case TGSI_FILE_INPUT:
1575			r = &pc->attr[src->SrcRegister.Index * 4 + c];
1576			break;
1577		case TGSI_FILE_TEMPORARY:
1578			r = &pc->temp[src->SrcRegister.Index * 4 + c];
1579			break;
1580		case TGSI_FILE_CONSTANT:
1581			if (!src->SrcRegister.Indirect) {
1582				r = &pc->param[src->SrcRegister.Index * 4 + c];
1583				break;
1584			}
1585			/* Indicate indirection by setting r->acc < 0 and
1586			 * use the index field to select the address reg.
1587			 */
1588			r = MALLOC_STRUCT(nv50_reg);
1589			swz = tgsi_util_get_src_register_swizzle(
1590						 &src->SrcRegisterInd, 0);
1591			ctor_reg(r, P_CONST,
1592				 src->SrcRegisterInd.Index * 4 + swz,
1593				 src->SrcRegister.Index * 4 + c);
1594			r->acc = -1;
1595			break;
1596		case TGSI_FILE_IMMEDIATE:
1597			r = &pc->immd[src->SrcRegister.Index * 4 + c];
1598			break;
1599		case TGSI_FILE_SAMPLER:
1600			break;
1601		case TGSI_FILE_ADDRESS:
1602			r = pc->addr[src->SrcRegister.Index * 4 + c];
1603			assert(r);
1604			break;
1605		default:
1606			assert(0);
1607			break;
1608		}
1609		break;
1610	default:
1611		assert(0);
1612		break;
1613	}
1614
1615	switch (sgn) {
1616	case TGSI_UTIL_SIGN_KEEP:
1617		break;
1618	case TGSI_UTIL_SIGN_CLEAR:
1619		temp = temp_temp(pc);
1620		emit_abs(pc, temp, r);
1621		r = temp;
1622		break;
1623	case TGSI_UTIL_SIGN_TOGGLE:
1624		if (neg)
1625			r->neg = 1;
1626		else {
1627			temp = temp_temp(pc);
1628			emit_neg(pc, temp, r);
1629			r = temp;
1630		}
1631		break;
1632	case TGSI_UTIL_SIGN_SET:
1633		temp = temp_temp(pc);
1634		emit_abs(pc, temp, r);
1635		if (neg)
1636			temp->neg = 1;
1637		else
1638			emit_neg(pc, temp, temp);
1639		r = temp;
1640		break;
1641	default:
1642		assert(0);
1643		break;
1644	}
1645
1646	return r;
1647}
1648
1649/* return TRUE for ops that produce only a single result */
1650static boolean
1651is_scalar_op(unsigned op)
1652{
1653	switch (op) {
1654	case TGSI_OPCODE_COS:
1655	case TGSI_OPCODE_DP2:
1656	case TGSI_OPCODE_DP3:
1657	case TGSI_OPCODE_DP4:
1658	case TGSI_OPCODE_DPH:
1659	case TGSI_OPCODE_EX2:
1660	case TGSI_OPCODE_LG2:
1661	case TGSI_OPCODE_POW:
1662	case TGSI_OPCODE_RCP:
1663	case TGSI_OPCODE_RSQ:
1664	case TGSI_OPCODE_SIN:
1665		/*
1666	case TGSI_OPCODE_KIL:
1667	case TGSI_OPCODE_LIT:
1668	case TGSI_OPCODE_SCS:
1669		*/
1670		return TRUE;
1671	default:
1672		return FALSE;
1673	}
1674}
1675
1676/* Returns a bitmask indicating which dst components depend
1677 * on source s, component c (reverse of nv50_tgsi_src_mask).
1678 */
1679static unsigned
1680nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1681{
1682	if (is_scalar_op(op))
1683		return 0x1;
1684
1685	switch (op) {
1686	case TGSI_OPCODE_DST:
1687		return (1 << c) & (s ? 0xa : 0x6);
1688	case TGSI_OPCODE_XPD:
1689		switch (c) {
1690		case 0: return 0x6;
1691		case 1: return 0x5;
1692		case 2: return 0x3;
1693		case 3: return 0x0;
1694		default:
1695			assert(0);
1696			return 0x0;
1697		}
1698	case TGSI_OPCODE_LIT:
1699	case TGSI_OPCODE_SCS:
1700	case TGSI_OPCODE_TEX:
1701	case TGSI_OPCODE_TXP:
1702		/* these take care of dangerous swizzles themselves */
1703		return 0x0;
1704	case TGSI_OPCODE_IF:
1705	case TGSI_OPCODE_KIL:
1706		/* don't call this function for these ops */
1707		assert(0);
1708		return 0;
1709	default:
1710		/* linear vector instruction */
1711		return (1 << c);
1712	}
1713}
1714
1715static INLINE boolean
1716has_pred(struct nv50_program_exec *e, unsigned cc)
1717{
1718	if (!is_long(e) || is_immd(e))
1719		return FALSE;
1720	return ((e->inst[1] & 0x780) == (cc << 7));
1721}
1722
1723/* on ENDIF see if we can do "@p0.neu single_op" instead of:
1724 *        join_at ENDIF
1725 *        @p0.eq bra ENDIF
1726 *        single_op
1727 * ENDIF: nop.join
1728 */
1729static boolean
1730nv50_kill_branch(struct nv50_pc *pc)
1731{
1732	int lvl = pc->if_lvl;
1733
1734	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
1735		return FALSE;
1736
1737	/* if ccode == 'true', the BRA is from an ELSE and the predicate
1738	 * reg may no longer be valid, since we currently always use $p0
1739	 */
1740	if (has_pred(pc->if_insn[lvl], 0xf))
1741		return FALSE;
1742	assert(pc->if_insn[lvl] && pc->br_join[lvl]);
1743
1744	/* We'll use the exec allocated for JOIN_AT (as we can't easily
1745	 * update prev's next); if exec_tail is BRK, update the pointer.
1746	 */
1747	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
1748		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];
1749
1750	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
1751
1752	*pc->br_join[lvl] = *pc->p->exec_tail;
1753
1754	FREE(pc->if_insn[lvl]);
1755	FREE(pc->p->exec_tail);
1756
1757	pc->p->exec_tail = pc->br_join[lvl];
1758	pc->p->exec_tail->next = NULL;
1759	set_pred(pc, 0xd, 0, pc->p->exec_tail);
1760
1761	return TRUE;
1762}
1763
1764static boolean
1765nv50_program_tx_insn(struct nv50_pc *pc,
1766		     const struct tgsi_full_instruction *inst)
1767{
1768	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
1769	unsigned mask, sat, unit;
1770	int i, c;
1771
1772	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1773	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1774
1775	memset(src, 0, sizeof(src));
1776
1777	for (c = 0; c < 4; c++) {
1778		if ((mask & (1 << c)) && !pc->r_dst[c])
1779			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1780		else
1781			dst[c] = pc->r_dst[c];
1782		rdst[c] = dst[c];
1783	}
1784
1785	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1786		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1787		unsigned src_mask;
1788		boolean neg_supp;
1789
1790		src_mask = nv50_tgsi_src_mask(inst, i);
1791		neg_supp = negate_supported(inst, i);
1792
1793		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1794			unit = fs->SrcRegister.Index;
1795
1796		for (c = 0; c < 4; c++)
1797			if (src_mask & (1 << c))
1798				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
1799	}
1800
1801	brdc = temp = pc->r_brdc;
1802	if (brdc && brdc->type != P_TEMP) {
1803		temp = temp_temp(pc);
1804		if (sat)
1805			brdc = temp;
1806	} else
1807	if (sat) {
1808		for (c = 0; c < 4; c++) {
1809			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
1810				continue;
1811			rdst[c] = dst[c];
1812			dst[c] = temp_temp(pc);
1813		}
1814	}
1815
1816	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
1817
1818	switch (inst->Instruction.Opcode) {
1819	case TGSI_OPCODE_ABS:
1820		for (c = 0; c < 4; c++) {
1821			if (!(mask & (1 << c)))
1822				continue;
1823			emit_abs(pc, dst[c], src[0][c]);
1824		}
1825		break;
1826	case TGSI_OPCODE_ADD:
1827		for (c = 0; c < 4; c++) {
1828			if (!(mask & (1 << c)))
1829				continue;
1830			emit_add(pc, dst[c], src[0][c], src[1][c]);
1831		}
1832		break;
1833	case TGSI_OPCODE_ARL:
1834		assert(src[0][0]);
1835		temp = temp_temp(pc);
1836		emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32);
1837		emit_arl(pc, dst[0], temp, 4);
1838		break;
1839	case TGSI_OPCODE_BGNLOOP:
1840		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
1841		terminate_mbb(pc);
1842		break;
1843	case TGSI_OPCODE_BRK:
1844		emit_branch(pc, -1, 0, NULL);
1845		assert(pc->loop_lvl > 0);
1846		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
1847		break;
1848	case TGSI_OPCODE_CEIL:
1849		for (c = 0; c < 4; c++) {
1850			if (!(mask & (1 << c)))
1851				continue;
1852			emit_cvt(pc, dst[c], src[0][c], -1,
1853				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
1854		}
1855		break;
1856	case TGSI_OPCODE_CMP:
1857		pc->allow32 = FALSE;
1858		for (c = 0; c < 4; c++) {
1859			if (!(mask & (1 << c)))
1860				continue;
1861			emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
1862			emit_mov(pc, dst[c], src[1][c]);
1863			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
1864			emit_mov(pc, dst[c], src[2][c]);
1865			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
1866		}
1867		break;
1868	case TGSI_OPCODE_COS:
1869		if (mask & 8) {
1870			emit_precossin(pc, temp, src[0][3]);
1871			emit_flop(pc, 5, dst[3], temp);
1872			if (!(mask &= 7))
1873				break;
1874			if (temp == dst[3])
1875				temp = brdc = temp_temp(pc);
1876		}
1877		emit_precossin(pc, temp, src[0][0]);
1878		emit_flop(pc, 5, brdc, temp);
1879		break;
1880	case TGSI_OPCODE_DDX:
1881		for (c = 0; c < 4; c++) {
1882			if (!(mask & (1 << c)))
1883				continue;
1884			emit_ddx(pc, dst[c], src[0][c]);
1885		}
1886		break;
1887	case TGSI_OPCODE_DDY:
1888		for (c = 0; c < 4; c++) {
1889			if (!(mask & (1 << c)))
1890				continue;
1891			emit_ddy(pc, dst[c], src[0][c]);
1892		}
1893		break;
1894	case TGSI_OPCODE_DP3:
1895		emit_mul(pc, temp, src[0][0], src[1][0]);
1896		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1897		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
1898		break;
1899	case TGSI_OPCODE_DP4:
1900		emit_mul(pc, temp, src[0][0], src[1][0]);
1901		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1902		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1903		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
1904		break;
1905	case TGSI_OPCODE_DPH:
1906		emit_mul(pc, temp, src[0][0], src[1][0]);
1907		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1908		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1909		emit_add(pc, brdc, src[1][3], temp);
1910		break;
1911	case TGSI_OPCODE_DST:
1912		if (mask & (1 << 1))
1913			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1914		if (mask & (1 << 2))
1915			emit_mov(pc, dst[2], src[0][2]);
1916		if (mask & (1 << 3))
1917			emit_mov(pc, dst[3], src[1][3]);
1918		if (mask & (1 << 0))
1919			emit_mov_immdval(pc, dst[0], 1.0f);
1920		break;
1921	case TGSI_OPCODE_ELSE:
1922		emit_branch(pc, -1, 0, NULL);
1923		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
1924		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
1925		terminate_mbb(pc);
1926		break;
1927	case TGSI_OPCODE_ENDIF:
1928		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
1929
1930		/* try to replace branch over 1 insn with a predicated insn */
1931		if (nv50_kill_branch(pc) == TRUE)
1932			break;
1933
1934		if (pc->br_join[pc->if_lvl]) {
1935			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
1936			pc->br_join[pc->if_lvl] = NULL;
1937		}
1938		terminate_mbb(pc);
1939		/* emit a NOP as join point, we could set it on the next
1940		 * one, but would have to make sure it is long and !immd
1941		 */
1942		emit_nop(pc);
1943		pc->p->exec_tail->inst[1] |= 2;
1944		break;
1945	case TGSI_OPCODE_ENDLOOP:
1946		emit_branch(pc, -1, 0, NULL);
1947		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
1948		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
1949		terminate_mbb(pc);
1950		break;
1951	case TGSI_OPCODE_EX2:
1952		emit_preex2(pc, temp, src[0][0]);
1953		emit_flop(pc, 6, brdc, temp);
1954		break;
1955	case TGSI_OPCODE_FLR:
1956		for (c = 0; c < 4; c++) {
1957			if (!(mask & (1 << c)))
1958				continue;
1959			emit_flr(pc, dst[c], src[0][c]);
1960		}
1961		break;
1962	case TGSI_OPCODE_FRC:
1963		temp = temp_temp(pc);
1964		for (c = 0; c < 4; c++) {
1965			if (!(mask & (1 << c)))
1966				continue;
1967			emit_flr(pc, temp, src[0][c]);
1968			emit_sub(pc, dst[c], src[0][c], temp);
1969		}
1970		break;
1971	case TGSI_OPCODE_IF:
1972		/* emitting a join_at may not be necessary */
1973		assert(pc->if_lvl < MAX_IF_DEPTH);
1974		set_pred_wr(pc, 1, 0, pc->if_cond);
1975		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
1976		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
1977		terminate_mbb(pc);
1978		break;
1979	case TGSI_OPCODE_KIL:
1980		emit_kil(pc, src[0][0]);
1981		emit_kil(pc, src[0][1]);
1982		emit_kil(pc, src[0][2]);
1983		emit_kil(pc, src[0][3]);
1984		break;
1985	case TGSI_OPCODE_LIT:
1986		emit_lit(pc, &dst[0], mask, &src[0][0]);
1987		break;
1988	case TGSI_OPCODE_LG2:
1989		emit_flop(pc, 3, brdc, src[0][0]);
1990		break;
1991	case TGSI_OPCODE_LRP:
1992		temp = temp_temp(pc);
1993		for (c = 0; c < 4; c++) {
1994			if (!(mask & (1 << c)))
1995				continue;
1996			emit_sub(pc, temp, src[1][c], src[2][c]);
1997			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1998		}
1999		break;
2000	case TGSI_OPCODE_MAD:
2001		for (c = 0; c < 4; c++) {
2002			if (!(mask & (1 << c)))
2003				continue;
2004			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
2005		}
2006		break;
2007	case TGSI_OPCODE_MAX:
2008		for (c = 0; c < 4; c++) {
2009			if (!(mask & (1 << c)))
2010				continue;
2011			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
2012		}
2013		break;
2014	case TGSI_OPCODE_MIN:
2015		for (c = 0; c < 4; c++) {
2016			if (!(mask & (1 << c)))
2017				continue;
2018			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
2019		}
2020		break;
2021	case TGSI_OPCODE_MOV:
2022		for (c = 0; c < 4; c++) {
2023			if (!(mask & (1 << c)))
2024				continue;
2025			emit_mov(pc, dst[c], src[0][c]);
2026		}
2027		break;
2028	case TGSI_OPCODE_MUL:
2029		for (c = 0; c < 4; c++) {
2030			if (!(mask & (1 << c)))
2031				continue;
2032			emit_mul(pc, dst[c], src[0][c], src[1][c]);
2033		}
2034		break;
2035	case TGSI_OPCODE_POW:
2036		emit_pow(pc, brdc, src[0][0], src[1][0]);
2037		break;
2038	case TGSI_OPCODE_RCP:
2039		emit_flop(pc, 0, brdc, src[0][0]);
2040		break;
2041	case TGSI_OPCODE_RSQ:
2042		emit_flop(pc, 2, brdc, src[0][0]);
2043		break;
2044	case TGSI_OPCODE_SCS:
2045		temp = temp_temp(pc);
2046		if (mask & 3)
2047			emit_precossin(pc, temp, src[0][0]);
2048		if (mask & (1 << 0))
2049			emit_flop(pc, 5, dst[0], temp);
2050		if (mask & (1 << 1))
2051			emit_flop(pc, 4, dst[1], temp);
2052		if (mask & (1 << 2))
2053			emit_mov_immdval(pc, dst[2], 0.0);
2054		if (mask & (1 << 3))
2055			emit_mov_immdval(pc, dst[3], 1.0);
2056		break;
2057	case TGSI_OPCODE_SIN:
2058		if (mask & 8) {
2059			emit_precossin(pc, temp, src[0][3]);
2060			emit_flop(pc, 4, dst[3], temp);
2061			if (!(mask &= 7))
2062				break;
2063			if (temp == dst[3])
2064				temp = brdc = temp_temp(pc);
2065		}
2066		emit_precossin(pc, temp, src[0][0]);
2067		emit_flop(pc, 4, brdc, temp);
2068		break;
2069	case TGSI_OPCODE_SLT:
2070	case TGSI_OPCODE_SGE:
2071	case TGSI_OPCODE_SEQ:
2072	case TGSI_OPCODE_SGT:
2073	case TGSI_OPCODE_SLE:
2074	case TGSI_OPCODE_SNE:
2075		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
2076		for (c = 0; c < 4; c++) {
2077			if (!(mask & (1 << c)))
2078				continue;
2079			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
2080		}
2081		break;
2082	case TGSI_OPCODE_SUB:
2083		for (c = 0; c < 4; c++) {
2084			if (!(mask & (1 << c)))
2085				continue;
2086			emit_sub(pc, dst[c], src[0][c], src[1][c]);
2087		}
2088		break;
2089	case TGSI_OPCODE_TEX:
2090		emit_tex(pc, dst, mask, src[0], unit,
2091			 inst->InstructionExtTexture.Texture, FALSE);
2092		break;
2093	case TGSI_OPCODE_TXP:
2094		emit_tex(pc, dst, mask, src[0], unit,
2095			 inst->InstructionExtTexture.Texture, TRUE);
2096		break;
2097	case TGSI_OPCODE_TRUNC:
2098		for (c = 0; c < 4; c++) {
2099			if (!(mask & (1 << c)))
2100				continue;
2101			emit_cvt(pc, dst[c], src[0][c], -1,
2102				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
2103		}
2104		break;
2105	case TGSI_OPCODE_XPD:
2106		temp = temp_temp(pc);
2107		if (mask & (1 << 0)) {
2108			emit_mul(pc, temp, src[0][2], src[1][1]);
2109			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
2110		}
2111		if (mask & (1 << 1)) {
2112			emit_mul(pc, temp, src[0][0], src[1][2]);
2113			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
2114		}
2115		if (mask & (1 << 2)) {
2116			emit_mul(pc, temp, src[0][1], src[1][0]);
2117			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
2118		}
2119		if (mask & (1 << 3))
2120			emit_mov_immdval(pc, dst[3], 1.0);
2121		break;
2122	case TGSI_OPCODE_END:
2123		break;
2124	default:
2125		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
2126		return FALSE;
2127	}
2128
2129	if (brdc) {
2130		if (sat)
2131			emit_sat(pc, brdc, brdc);
2132		for (c = 0; c < 4; c++)
2133			if ((mask & (1 << c)) && dst[c] != brdc)
2134				emit_mov(pc, dst[c], brdc);
2135	} else
2136	if (sat) {
2137		for (c = 0; c < 4; c++) {
2138			if (!(mask & (1 << c)))
2139				continue;
2140			/* in this case we saturate later */
2141			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
2142				continue;
2143			emit_sat(pc, rdst[c], dst[c]);
2144		}
2145	}
2146
2147	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2148		for (c = 0; c < 4; c++) {
2149			if (!src[i][c])
2150				continue;
2151			src[i][c]->neg = 0;
2152			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
2153				FREE(src[i][c]);
2154			else
2155			if (src[i][c]->acc < 0 && src[i][c]->type == P_CONST)
2156				FREE(src[i][c]); /* indirect constant */
2157		}
2158	}
2159
2160	kill_temp_temp(pc);
2161	return TRUE;
2162}
2163
2164static void
2165prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
2166{
2167	struct nv50_reg *reg = NULL;
2168	const struct tgsi_full_src_register *src;
2169	const struct tgsi_dst_register *dst;
2170	unsigned i, c, k, mask;
2171
2172	dst = &insn->FullDstRegisters[0].DstRegister;
2173	mask = dst->WriteMask;
2174
2175        if (dst->File == TGSI_FILE_TEMPORARY)
2176                reg = pc->temp;
2177        else
2178        if (dst->File == TGSI_FILE_OUTPUT)
2179                reg = pc->result;
2180
2181	if (reg) {
2182		for (c = 0; c < 4; c++) {
2183			if (!(mask & (1 << c)))
2184				continue;
2185			reg[dst->Index * 4 + c].acc = pc->insn_nr;
2186		}
2187	}
2188
2189	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2190		src = &insn->FullSrcRegisters[i];
2191
2192		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
2193			reg = pc->temp;
2194		else
2195		if (src->SrcRegister.File == TGSI_FILE_INPUT)
2196			reg = pc->attr;
2197		else
2198			continue;
2199
2200		mask = nv50_tgsi_src_mask(insn, i);
2201
2202		for (c = 0; c < 4; c++) {
2203			if (!(mask & (1 << c)))
2204				continue;
2205			k = tgsi_util_get_full_src_register_swizzle(src, c);
2206
2207			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
2208		}
2209	}
2210}
2211
/* Returns a bitmask indicating which dst components need to be
 * written to temporaries first to avoid 'corrupting' sources.
 *
 * m[i]   (out) indicate component to write in the i-th position
 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
 *
 * The returned mask is indexed by position in m[], not by component:
 * bit i is set if the component written i-th is still needed as a
 * source by a component written later. It is 0 if reordering resolved
 * all dependencies.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* unsafe must start out empty, bits are only ever OR-ed into it */
	unsigned i, c, x, unsafe = 0;

	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart
		 * NOTE(review): the loop increment runs after this
		 * assignment, so scanning resumes at position 1, not 0.
		 */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
2256
2257/* Select a suitable dst register for broadcasting scalar results,
2258 * or return NULL if we have to allocate an extra TEMP.
2259 *
2260 * If e.g. only 1 component is written, we may also emit the final
2261 * result to a write-only register.
2262 */
2263static struct nv50_reg *
2264tgsi_broadcast_dst(struct nv50_pc *pc,
2265		   const struct tgsi_full_dst_register *fd, unsigned mask)
2266{
2267	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
2268		int c = ffs(~mask & fd->DstRegister.WriteMask);
2269		if (c)
2270			return tgsi_dst(pc, c - 1, fd);
2271	} else {
2272		int c = ffs(fd->DstRegister.WriteMask) - 1;
2273		if ((1 << c) == fd->DstRegister.WriteMask)
2274			return tgsi_dst(pc, c, fd);
2275	}
2276
2277	return NULL;
2278}
2279
2280/* Scan source swizzles and return a bitmask indicating dst regs that
2281 * also occur among the src regs, and fill rdep for nv50_revdep_reoder.
2282 */
2283static unsigned
2284nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
2285		       unsigned rdep[4])
2286{
2287	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
2288	const struct tgsi_full_src_register *fs;
2289	unsigned i, deqs = 0;
2290
2291	for (i = 0; i < 4; ++i)
2292		rdep[i] = 0;
2293
2294	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2295		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
2296		boolean neg_supp = negate_supported(insn, i);
2297
2298		fs = &insn->FullSrcRegisters[i];
2299		if (fs->SrcRegister.File != fd->DstRegister.File ||
2300		    fs->SrcRegister.Index != fd->DstRegister.Index)
2301			continue;
2302
2303		for (chn = 0; chn < 4; ++chn) {
2304			unsigned s, c;
2305
2306			if (!(mask & (1 << chn))) /* src is not read */
2307				continue;
2308			c = tgsi_util_get_full_src_register_swizzle(fs, chn);
2309			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
2310
2311			if (!(fd->DstRegister.WriteMask & (1 << c)))
2312				continue;
2313
2314			/* no danger if src is copied to TEMP first */
2315			if ((s != TGSI_UTIL_SIGN_KEEP) &&
2316			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
2317				continue;
2318
2319			rdep[c] |= nv50_tgsi_dst_revdep(
2320				insn->Instruction.Opcode, i, chn);
2321			deqs |= (1 << c);
2322		}
2323	}
2324
2325	return deqs;
2326}
2327
2328static boolean
2329nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
2330{
2331	struct tgsi_full_instruction insn = tok->FullInstruction;
2332	const struct tgsi_full_dst_register *fd;
2333	unsigned i, deqs, rdep[4], m[4];
2334
2335	fd = &tok->FullInstruction.FullDstRegisters[0];
2336	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
2337
2338	if (is_scalar_op(insn.Instruction.Opcode)) {
2339		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
2340		if (!pc->r_brdc)
2341			pc->r_brdc = temp_temp(pc);
2342		return nv50_program_tx_insn(pc, &insn);
2343	}
2344	pc->r_brdc = NULL;
2345
2346	if (!deqs)
2347		return nv50_program_tx_insn(pc, &insn);
2348
2349	deqs = nv50_revdep_reorder(m, rdep);
2350
2351	for (i = 0; i < 4; ++i) {
2352		assert(pc->r_dst[m[i]] == NULL);
2353
2354		insn.FullDstRegisters[0].DstRegister.WriteMask =
2355			fd->DstRegister.WriteMask & (1 << m[i]);
2356
2357		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
2358			continue;
2359
2360		if (deqs & (1 << i))
2361			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
2362
2363		if (!nv50_program_tx_insn(pc, &insn))
2364			return FALSE;
2365	}
2366
2367	for (i = 0; i < 4; i++) {
2368		struct nv50_reg *reg = pc->r_dst[i];
2369		if (!reg)
2370			continue;
2371		pc->r_dst[i] = NULL;
2372
2373		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
2374			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
2375		else
2376			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
2377		free_temp(pc, reg);
2378	}
2379
2380	return TRUE;
2381}
2382
2383static void
2384load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
2385{
2386	struct nv50_reg *iv, **ppiv;
2387	unsigned mode = pc->interp_mode[reg->index];
2388
2389	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
2390	iv = *ppiv;
2391
2392	if ((mode & INTERP_PERSPECTIVE) && !iv) {
2393		iv = *ppiv = alloc_temp(pc, NULL);
2394		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
2395
2396		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2397		emit_flop(pc, 0, iv, iv);
2398
2399		/* XXX: when loading interpolants dynamically, move these
2400		 * to the program head, or make sure it can't be skipped.
2401		 */
2402	}
2403
2404	emit_interp(pc, reg, iv, mode);
2405}
2406
2407static boolean
2408nv50_program_tx_prep(struct nv50_pc *pc)
2409{
2410	struct tgsi_parse_context tp;
2411	struct nv50_program *p = pc->p;
2412	boolean ret = FALSE;
2413	unsigned i, c, flat_nr = 0;
2414
2415	tgsi_parse_init(&tp, pc->p->pipe.tokens);
2416	while (!tgsi_parse_end_of_tokens(&tp)) {
2417		const union tgsi_full_token *tok = &tp.FullToken;
2418
2419		tgsi_parse_token(&tp);
2420		switch (tok->Token.Type) {
2421		case TGSI_TOKEN_TYPE_IMMEDIATE:
2422		{
2423			const struct tgsi_full_immediate *imm =
2424				&tp.FullToken.FullImmediate;
2425
2426			ctor_immd(pc, imm->u[0].Float,
2427				      imm->u[1].Float,
2428				      imm->u[2].Float,
2429				      imm->u[3].Float);
2430		}
2431			break;
2432		case TGSI_TOKEN_TYPE_DECLARATION:
2433		{
2434			const struct tgsi_full_declaration *d;
2435			unsigned si, last, first, mode;
2436
2437			d = &tp.FullToken.FullDeclaration;
2438			first = d->DeclarationRange.First;
2439			last = d->DeclarationRange.Last;
2440
2441			switch (d->Declaration.File) {
2442			case TGSI_FILE_TEMPORARY:
2443				break;
2444			case TGSI_FILE_OUTPUT:
2445				if (!d->Declaration.Semantic ||
2446				    p->type == PIPE_SHADER_FRAGMENT)
2447					break;
2448
2449				si = d->Semantic.SemanticIndex;
2450				switch (d->Semantic.SemanticName) {
2451				case TGSI_SEMANTIC_BCOLOR:
2452					p->cfg.two_side[si].hw = first;
2453					if (p->cfg.io_nr > first)
2454						p->cfg.io_nr = first;
2455					break;
2456				case TGSI_SEMANTIC_PSIZE:
2457					p->cfg.psiz = first;
2458					if (p->cfg.io_nr > first)
2459						p->cfg.io_nr = first;
2460					break;
2461					/*
2462				case TGSI_SEMANTIC_CLIP_DISTANCE:
2463					p->cfg.clpd = MIN2(p->cfg.clpd, first);
2464					break;
2465					*/
2466				default:
2467					break;
2468				}
2469				break;
2470			case TGSI_FILE_INPUT:
2471			{
2472				if (p->type != PIPE_SHADER_FRAGMENT)
2473					break;
2474
2475				switch (d->Declaration.Interpolate) {
2476				case TGSI_INTERPOLATE_CONSTANT:
2477					mode = INTERP_FLAT;
2478					flat_nr++;
2479					break;
2480				case TGSI_INTERPOLATE_PERSPECTIVE:
2481					mode = INTERP_PERSPECTIVE;
2482					p->cfg.regs[1] |= 0x08 << 24;
2483					break;
2484				default:
2485					mode = INTERP_LINEAR;
2486					break;
2487				}
2488				if (d->Declaration.Centroid)
2489					mode |= INTERP_CENTROID;
2490
2491				assert(last < 32);
2492				for (i = first; i <= last; i++)
2493					pc->interp_mode[i] = mode;
2494			}
2495				break;
2496			case TGSI_FILE_ADDRESS:
2497			case TGSI_FILE_CONSTANT:
2498			case TGSI_FILE_SAMPLER:
2499				break;
2500			default:
2501				NOUVEAU_ERR("bad decl file %d\n",
2502					    d->Declaration.File);
2503				goto out_err;
2504			}
2505		}
2506			break;
2507		case TGSI_TOKEN_TYPE_INSTRUCTION:
2508			pc->insn_nr++;
2509			prep_inspect_insn(pc, &tok->FullInstruction);
2510			break;
2511		default:
2512			break;
2513		}
2514	}
2515
2516	if (p->type == PIPE_SHADER_VERTEX) {
2517		int rid = 0;
2518
2519		for (i = 0; i < pc->attr_nr * 4; ++i) {
2520			if (pc->attr[i].acc) {
2521				pc->attr[i].hw = rid++;
2522				p->cfg.attr[i / 32] |= 1 << (i % 32);
2523			}
2524		}
2525
2526		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
2527			p->cfg.io[i].hw = rid;
2528			p->cfg.io[i].id_vp = i;
2529
2530			for (c = 0; c < 4; ++c) {
2531				int n = i * 4 + c;
2532				if (!pc->result[n].acc)
2533					continue;
2534				pc->result[n].hw = rid++;
2535				p->cfg.io[i].mask |= 1 << c;
2536			}
2537		}
2538
2539		for (c = 0; c < 2; ++c)
2540			if (p->cfg.two_side[c].hw < 0x40)
2541				p->cfg.two_side[c] = p->cfg.io[
2542					p->cfg.two_side[c].hw];
2543
2544		if (p->cfg.psiz < 0x40)
2545			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
2546	} else
2547	if (p->type == PIPE_SHADER_FRAGMENT) {
2548		int rid, aid;
2549		unsigned n = 0, m = pc->attr_nr - flat_nr;
2550
2551		int base = (TGSI_SEMANTIC_POSITION ==
2552			    p->info.input_semantic_name[0]) ? 0 : 1;
2553
2554		/* non-flat interpolants have to be mapped to
2555		 * the lower hardware IDs, so sort them:
2556		 */
2557		for (i = 0; i < pc->attr_nr; i++) {
2558			if (pc->interp_mode[i] == INTERP_FLAT) {
2559				p->cfg.io[m].id_vp = i + base;
2560				p->cfg.io[m++].id_fp = i;
2561			} else {
2562				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
2563					p->cfg.io[n].linear = TRUE;
2564				p->cfg.io[n].id_vp = i + base;
2565				p->cfg.io[n++].id_fp = i;
2566			}
2567		}
2568
2569		if (!base) /* set w-coordinate mask from perspective interp */
2570			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
2571
2572		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
2573			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
2574
2575		for (n = 0; n < pc->attr_nr; ++n) {
2576			p->cfg.io[n].hw = rid = aid;
2577			i = p->cfg.io[n].id_fp;
2578
2579			for (c = 0; c < 4; ++c) {
2580				if (!pc->attr[i * 4 + c].acc)
2581					continue;
2582				pc->attr[i * 4 + c].rhw = rid++;
2583				p->cfg.io[n].mask |= 1 << c;
2584
2585				load_interpolant(pc, &pc->attr[i * 4 + c]);
2586			}
2587			aid += popcnt4(p->cfg.io[n].mask);
2588		}
2589
2590		if (!base)
2591			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
2592
2593		m = popcnt4(p->cfg.regs[1] >> 24);
2594
2595		/* set count of non-position inputs and of non-flat
2596		 * non-position inputs for FP_INTERPOLANT_CTRL
2597		 */
2598		p->cfg.regs[1] |= aid - m;
2599
2600		if (flat_nr) {
2601			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
2602			p->cfg.regs[1] |= (i - m) << 16;
2603		} else
2604			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
2605
2606		/* mark color semantic for light-twoside */
2607		n = 0x40;
2608		for (i = 0; i < pc->attr_nr; i++) {
2609			ubyte si, sn;
2610
2611			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
2612			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
2613
2614			if (sn == TGSI_SEMANTIC_COLOR) {
2615				p->cfg.two_side[si] = p->cfg.io[i];
2616
2617				/* increase colour count */
2618				p->cfg.regs[0] += popcnt4(
2619					p->cfg.two_side[si].mask) << 16;
2620
2621				n = MIN2(n, p->cfg.io[i].hw - m);
2622			}
2623		}
2624		if (n < 0x40)
2625			p->cfg.regs[0] += n;
2626
2627		/* Initialize FP results:
2628		 * FragDepth is always first TGSI and last hw output
2629		 */
2630		i = p->info.writes_z ? 4 : 0;
2631		for (rid = 0; i < pc->result_nr * 4; i++)
2632			pc->result[i].rhw = rid++;
2633		if (p->info.writes_z)
2634			pc->result[2].rhw = rid;
2635
2636		p->cfg.high_result = rid;
2637	}
2638
2639	if (pc->immd_nr) {
2640		int rid = 0;
2641
2642		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
2643		if (!pc->immd)
2644			goto out_err;
2645
2646		for (i = 0; i < pc->immd_nr; i++) {
2647			for (c = 0; c < 4; c++, rid++)
2648				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
2649		}
2650	}
2651
2652	ret = TRUE;
2653out_err:
2654	if (pc->iv_p)
2655		free_temp(pc, pc->iv_p);
2656	if (pc->iv_c)
2657		free_temp(pc, pc->iv_c);
2658
2659	tgsi_parse_free(&tp);
2660	return ret;
2661}
2662
2663static void
2664free_nv50_pc(struct nv50_pc *pc)
2665{
2666	if (pc->immd)
2667		FREE(pc->immd);
2668	if (pc->param)
2669		FREE(pc->param);
2670	if (pc->result)
2671		FREE(pc->result);
2672	if (pc->attr)
2673		FREE(pc->attr);
2674	if (pc->temp)
2675		FREE(pc->temp);
2676
2677	FREE(pc);
2678}
2679
2680static boolean
2681ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
2682{
2683	int i, c;
2684	unsigned rtype[2] = { P_ATTR, P_RESULT };
2685
2686	pc->p = p;
2687	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
2688	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
2689	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
2690	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
2691	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
2692	assert(pc->addr_nr <= 2);
2693
2694	p->cfg.high_temp = 4;
2695
2696	p->cfg.two_side[0].hw = 0x40;
2697	p->cfg.two_side[1].hw = 0x40;
2698
2699	switch (p->type) {
2700	case PIPE_SHADER_VERTEX:
2701		p->cfg.psiz = 0x40;
2702		p->cfg.clpd = 0x40;
2703		p->cfg.io_nr = pc->result_nr;
2704		break;
2705	case PIPE_SHADER_FRAGMENT:
2706		rtype[0] = rtype[1] = P_TEMP;
2707
2708		p->cfg.regs[0] = 0x01000004;
2709		p->cfg.io_nr = pc->attr_nr;
2710
2711		if (p->info.writes_z) {
2712			p->cfg.regs[2] |= 0x00000100;
2713			p->cfg.regs[3] |= 0x00000011;
2714		}
2715		if (p->info.uses_kill)
2716			p->cfg.regs[2] |= 0x00100000;
2717		break;
2718	}
2719
2720	if (pc->temp_nr) {
2721		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
2722		if (!pc->temp)
2723			return FALSE;
2724
2725		for (i = 0; i < pc->temp_nr * 4; ++i)
2726			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
2727	}
2728
2729	if (pc->attr_nr) {
2730		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
2731		if (!pc->attr)
2732			return FALSE;
2733
2734		for (i = 0; i < pc->attr_nr * 4; ++i)
2735			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
2736	}
2737
2738	if (pc->result_nr) {
2739		unsigned nr = pc->result_nr * 4;
2740
2741		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
2742		if (!pc->result)
2743			return FALSE;
2744
2745		for (i = 0; i < nr; ++i)
2746			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
2747	}
2748
2749	if (pc->param_nr) {
2750		int rid = 0;
2751
2752		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
2753		if (!pc->param)
2754			return FALSE;
2755
2756		for (i = 0; i < pc->param_nr; ++i)
2757			for (c = 0; c < 4; ++c, ++rid)
2758				ctor_reg(&pc->param[rid], P_CONST, i, rid);
2759	}
2760
2761	if (pc->addr_nr) {
2762		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *));
2763		if (!pc->addr)
2764			return FALSE;
2765	}
2766	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
2767		ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1);
2768
2769	return TRUE;
2770}
2771
2772static void
2773nv50_fp_move_results(struct nv50_pc *pc)
2774{
2775	struct nv50_reg reg;
2776	unsigned i;
2777
2778	ctor_reg(&reg, P_TEMP, -1, -1);
2779
2780	for (i = 0; i < pc->result_nr * 4; ++i) {
2781		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
2782			continue;
2783		if (pc->result[i].rhw != pc->result[i].hw) {
2784			reg.hw = pc->result[i].rhw;
2785			emit_mov(pc, &reg, &pc->result[i]);
2786		}
2787	}
2788}
2789
2790static void
2791nv50_program_fixup_insns(struct nv50_pc *pc)
2792{
2793	struct nv50_program_exec *e, *prev = NULL, **bra_list;
2794	unsigned i, n, pos;
2795
2796	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
2797
2798	/* Collect branch instructions, we need to adjust their offsets
2799	 * when converting 32 bit instructions to 64 bit ones
2800	 */
2801	for (n = 0, e = pc->p->exec_head; e; e = e->next)
2802		if (e->param.index >= 0 && !e->param.mask)
2803			bra_list[n++] = e;
2804
2805	/* Make sure we don't have any single 32 bit instructions. */
2806	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
2807		pos += is_long(e) ? 2 : 1;
2808
2809		if ((pos & 1) && (!e->next || is_long(e->next))) {
2810			for (i = 0; i < n; ++i)
2811				if (bra_list[i]->param.index >= pos)
2812					bra_list[i]->param.index += 1;
2813			convert_to_long(pc, e);
2814			++pos;
2815		}
2816		if (e->next)
2817			prev = e;
2818	}
2819
2820	assert(!is_immd(pc->p->exec_head));
2821	assert(!is_immd(pc->p->exec_tail));
2822
2823	/* last instruction must be long so it can have the end bit set */
2824	if (!is_long(pc->p->exec_tail)) {
2825		convert_to_long(pc, pc->p->exec_tail);
2826		if (prev)
2827			convert_to_long(pc, prev);
2828	}
2829	assert(!(pc->p->exec_tail->inst[1] & 2));
2830	/* set the end-bit */
2831	pc->p->exec_tail->inst[1] |= 1;
2832
2833	FREE(bra_list);
2834}
2835
2836static boolean
2837nv50_program_tx(struct nv50_program *p)
2838{
2839	struct tgsi_parse_context parse;
2840	struct nv50_pc *pc;
2841	boolean ret;
2842
2843	pc = CALLOC_STRUCT(nv50_pc);
2844	if (!pc)
2845		return FALSE;
2846
2847	ret = ctor_nv50_pc(pc, p);
2848	if (ret == FALSE)
2849		goto out_cleanup;
2850
2851	ret = nv50_program_tx_prep(pc);
2852	if (ret == FALSE)
2853		goto out_cleanup;
2854
2855	tgsi_parse_init(&parse, pc->p->pipe.tokens);
2856	while (!tgsi_parse_end_of_tokens(&parse)) {
2857		const union tgsi_full_token *tok = &parse.FullToken;
2858
2859		/* don't allow half insn/immd on first and last instruction */
2860		pc->allow32 = TRUE;
2861		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2862			pc->allow32 = FALSE;
2863
2864		tgsi_parse_token(&parse);
2865
2866		switch (tok->Token.Type) {
2867		case TGSI_TOKEN_TYPE_INSTRUCTION:
2868			++pc->insn_cur;
2869			ret = nv50_tgsi_insn(pc, tok);
2870			if (ret == FALSE)
2871				goto out_err;
2872			break;
2873		default:
2874			break;
2875		}
2876	}
2877
2878	if (pc->p->type == PIPE_SHADER_FRAGMENT)
2879		nv50_fp_move_results(pc);
2880
2881	nv50_program_fixup_insns(pc);
2882
2883	p->param_nr = pc->param_nr * 4;
2884	p->immd_nr = pc->immd_nr * 4;
2885	p->immd = pc->immd_buf;
2886
2887out_err:
2888	tgsi_parse_free(&parse);
2889
2890out_cleanup:
2891	free_nv50_pc(pc);
2892	return ret;
2893}
2894
2895static void
2896nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2897{
2898	if (nv50_program_tx(p) == FALSE)
2899		assert(0);
2900	p->translated = TRUE;
2901}
2902
2903static void
2904nv50_program_upload_data(struct nv50_context *nv50, float *map,
2905			unsigned start, unsigned count, unsigned cbuf)
2906{
2907	struct nouveau_channel *chan = nv50->screen->base.channel;
2908	struct nouveau_grobj *tesla = nv50->screen->tesla;
2909
2910	while (count) {
2911		unsigned nr = count > 2047 ? 2047 : count;
2912
2913		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2914		OUT_RING  (chan, (cbuf << 0) | (start << 8));
2915		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2916		OUT_RINGp (chan, map, nr);
2917
2918		map += nr;
2919		start += nr;
2920		count -= nr;
2921	}
2922}
2923
2924static void
2925nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2926{
2927	struct pipe_screen *pscreen = nv50->pipe.screen;
2928
2929	if (!p->data[0] && p->immd_nr) {
2930		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2931
2932		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
2933			while (heap->next && heap->size < p->immd_nr) {
2934				struct nv50_program *evict = heap->next->priv;
2935				nouveau_resource_free(&evict->data[0]);
2936			}
2937
2938			if (nouveau_resource_alloc(heap, p->immd_nr, p,
2939						   &p->data[0]))
2940				assert(0);
2941		}
2942
2943		/* immediates only need to be uploaded again when freed */
2944		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2945					 p->immd_nr, NV50_CB_PMISC);
2946	}
2947
2948	assert(p->param_nr <= 512);
2949
2950	if (p->param_nr) {
2951		unsigned cb;
2952		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
2953					     PIPE_BUFFER_USAGE_CPU_READ);
2954
2955		if (p->type == PIPE_SHADER_VERTEX)
2956			cb = NV50_CB_PVP;
2957		else
2958			cb = NV50_CB_PFP;
2959
2960		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
2961		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
2962	}
2963}
2964
2965static void
2966nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2967{
2968	struct nouveau_channel *chan = nv50->screen->base.channel;
2969	struct nouveau_grobj *tesla = nv50->screen->tesla;
2970	struct nv50_program_exec *e;
2971	struct nouveau_stateobj *so;
2972	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2973	unsigned start, count, *up, *ptr;
2974	boolean upload = FALSE;
2975
2976	if (!p->bo) {
2977		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
2978			       p->exec_size * 4, &p->bo);
2979		upload = TRUE;
2980	}
2981
2982	if (p->data[0] && p->data[0]->start != p->data_start[0])
2983		upload = TRUE;
2984
2985	if (!upload)
2986		return;
2987
2988	for (e = p->exec_head; e; e = e->next) {
2989		unsigned ei, ci, bs;
2990
2991		if (e->param.index < 0)
2992			continue;
2993
2994		if (e->param.mask == 0) {
2995			assert(!(e->param.index & 1));
2996			/* seem to be 8 byte steps */
2997			ei = (e->param.index >> 1) + 0 /* START_ID */;
2998
2999			e->inst[0] &= 0xf0000fff;
3000			e->inst[0] |= ei << 12;
3001			continue;
3002		}
3003
3004		bs = (e->inst[1] >> 22) & 0x07;
3005		assert(bs < 2);
3006		ei = e->param.shift >> 5;
3007		ci = e->param.index;
3008		if (bs == 0)
3009			ci += p->data[bs]->start;
3010
3011		e->inst[ei] &= ~e->param.mask;
3012		e->inst[ei] |= (ci << e->param.shift);
3013	}
3014
3015	if (p->data[0])
3016		p->data_start[0] = p->data[0]->start;
3017
3018#ifdef NV50_PROGRAM_DUMP
3019	NOUVEAU_ERR("-------\n");
3020	for (e = p->exec_head; e; e = e->next) {
3021		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
3022		if (is_long(e))
3023			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
3024	}
3025#endif
3026
3027	up = ptr = MALLOC(p->exec_size * 4);
3028	for (e = p->exec_head; e; e = e->next) {
3029		*(ptr++) = e->inst[0];
3030		if (is_long(e))
3031			*(ptr++) = e->inst[1];
3032	}
3033
3034	so = so_new(4,2);
3035	so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
3036	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
3037	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
3038	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
3039
3040	start = 0; count = p->exec_size;
3041	while (count) {
3042		struct nouveau_channel *chan = nv50->screen->base.channel;
3043		unsigned nr;
3044
3045		so_emit(chan, so);
3046
3047		nr = MIN2(count, 2047);
3048		nr = MIN2(chan->pushbuf->remaining, nr);
3049		if (chan->pushbuf->remaining < (nr + 3)) {
3050			FIRE_RING(chan);
3051			continue;
3052		}
3053
3054		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
3055		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
3056		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
3057		OUT_RINGp (chan, up + start, nr);
3058
3059		start += nr;
3060		count -= nr;
3061	}
3062
3063	FREE(up);
3064	so_ref(NULL, &so);
3065}
3066
3067void
3068nv50_vertprog_validate(struct nv50_context *nv50)
3069{
3070	struct nouveau_grobj *tesla = nv50->screen->tesla;
3071	struct nv50_program *p = nv50->vertprog;
3072	struct nouveau_stateobj *so;
3073
3074	if (!p->translated) {
3075		nv50_program_validate(nv50, p);
3076		if (!p->translated)
3077			assert(0);
3078	}
3079
3080	nv50_program_validate_data(nv50, p);
3081	nv50_program_validate_code(nv50, p);
3082
3083	so = so_new(13, 2);
3084	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
3085	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3086		      NOUVEAU_BO_HIGH, 0, 0);
3087	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3088		      NOUVEAU_BO_LOW, 0, 0);
3089	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
3090	so_data  (so, p->cfg.attr[0]);
3091	so_data  (so, p->cfg.attr[1]);
3092	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
3093	so_data  (so, p->cfg.high_result);
3094	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
3095	so_data  (so, p->cfg.high_result); //8);
3096	so_data  (so, p->cfg.high_temp);
3097	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
3098	so_data  (so, 0); /* program start offset */
3099	so_ref(so, &nv50->state.vertprog);
3100	so_ref(NULL, &so);
3101}
3102
3103void
3104nv50_fragprog_validate(struct nv50_context *nv50)
3105{
3106	struct nouveau_grobj *tesla = nv50->screen->tesla;
3107	struct nv50_program *p = nv50->fragprog;
3108	struct nouveau_stateobj *so;
3109
3110	if (!p->translated) {
3111		nv50_program_validate(nv50, p);
3112		if (!p->translated)
3113			assert(0);
3114	}
3115
3116	nv50_program_validate_data(nv50, p);
3117	nv50_program_validate_code(nv50, p);
3118
3119	so = so_new(64, 2);
3120	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
3121	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3122		      NOUVEAU_BO_HIGH, 0, 0);
3123	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3124		      NOUVEAU_BO_LOW, 0, 0);
3125	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
3126	so_data  (so, p->cfg.high_temp);
3127	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
3128	so_data  (so, p->cfg.high_result);
3129	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
3130	so_data  (so, p->cfg.regs[2]);
3131	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
3132	so_data  (so, p->cfg.regs[3]);
3133	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
3134	so_data  (so, 0); /* program start offset */
3135	so_ref(so, &nv50->state.fragprog);
3136	so_ref(NULL, &so);
3137}
3138
3139static void
3140nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
3141{
3142	struct nv50_program *fp = nv50->fragprog;
3143	struct nv50_program *vp = nv50->vertprog;
3144	unsigned i, c, m = base;
3145
3146	/* XXX: This can't work correctly in all cases yet, we either
3147	 * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has
3148	 * to be per FP input instead of per VP output
3149	 */
3150	memset(pntc, 0, 8 * sizeof(uint32_t));
3151
3152	for (i = 0; i < fp->cfg.io_nr; i++) {
3153		uint8_t sn, si;
3154		uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
3155		unsigned n = popcnt4(fp->cfg.io[i].mask);
3156
3157		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
3158			m += n;
3159			continue;
3160		}
3161
3162		sn = vp->info.input_semantic_name[j];
3163		si = vp->info.input_semantic_index[j];
3164
3165		if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) {
3166			ubyte mode =
3167				nv50->rasterizer->pipe.sprite_coord_mode[si];
3168
3169			if (mode == PIPE_SPRITE_COORD_NONE) {
3170				m += n;
3171				continue;
3172			}
3173		}
3174
3175		/* this is either PointCoord or replaced by sprite coords */
3176		for (c = 0; c < 4; c++) {
3177			if (!(fp->cfg.io[i].mask & (1 << c)))
3178				continue;
3179			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
3180			++m;
3181		}
3182	}
3183}
3184
3185static int
3186nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
3187	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
3188{
3189	int c;
3190	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
3191	uint8_t *map = (uint8_t *)p_map;
3192
3193	for (c = 0; c < 4; ++c) {
3194		if (mf & 1) {
3195			if (fpi->linear == TRUE)
3196				lin[mid / 32] |= 1 << (mid % 32);
3197			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
3198		}
3199
3200		oid += mv & 1;
3201		mf >>= 1;
3202		mv >>= 1;
3203	}
3204
3205	return mid;
3206}
3207
3208void
3209nv50_linkage_validate(struct nv50_context *nv50)
3210{
3211	struct nouveau_grobj *tesla = nv50->screen->tesla;
3212	struct nv50_program *vp = nv50->vertprog;
3213	struct nv50_program *fp = nv50->fragprog;
3214	struct nouveau_stateobj *so;
3215	struct nv50_sreg4 dummy, *vpo;
3216	int i, n, c, m = 0;
3217	uint32_t map[16], lin[4], reg[5], pcrd[8];
3218
3219	memset(map, 0, sizeof(map));
3220	memset(lin, 0, sizeof(lin));
3221
3222	reg[1] = 0x00000004; /* low and high clip distance map ids */
3223	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
3224	reg[3] = 0x00000000; /* point size map id & enable */
3225	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
3226	reg[4] = fp->cfg.regs[1]; /* interpolant info */
3227
3228	dummy.linear = FALSE;
3229	dummy.mask = 0xf; /* map all components of HPOS */
3230	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
3231
3232	dummy.mask = 0x0;
3233
3234	if (vp->cfg.clpd < 0x40) {
3235		for (c = 0; c < vp->cfg.clpd_nr; ++c)
3236			map[m++] = vp->cfg.clpd + c;
3237		reg[1] = (m << 8);
3238	}
3239
3240	reg[0] |= m << 8; /* adjust BFC0 id */
3241
3242	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
3243	if (nv50->rasterizer->pipe.light_twoside) {
3244		vpo = &vp->cfg.two_side[0];
3245
3246		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
3247		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
3248	}
3249
3250	reg[0] += m - 4; /* adjust FFC0 id */
3251	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
3252
3253	i = 0;
3254	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
3255		i = 1;
3256	for (; i < fp->cfg.io_nr; i++) {
3257		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
3258		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
3259
3260		n = fp->cfg.io[i].id_vp;
3261		if (n >= vp->cfg.io_nr ||
3262		    vp->info.output_semantic_name[n] != sn ||
3263		    vp->info.output_semantic_index[n] != si)
3264			vpo = &dummy;
3265		else
3266			vpo = &vp->cfg.io[n];
3267
3268		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
3269	}
3270
3271	if (nv50->rasterizer->pipe.point_size_per_vertex) {
3272		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
3273		reg[3] = (m++ << 4) | 1;
3274	}
3275
3276	/* now fill the stateobj */
3277	so = so_new(64, 0);
3278
3279	n = (m + 3) / 4;
3280	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
3281	so_data  (so, m);
3282	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
3283	so_datap (so, map, n);
3284
3285	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
3286	so_datap (so, reg, 4);
3287
3288	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
3289	so_data  (so, reg[4]);
3290
3291	so_method(so, tesla, 0x1540, 4);
3292	so_datap (so, lin, 4);
3293
3294	if (nv50->rasterizer->pipe.point_sprite) {
3295		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
3296
3297		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
3298		so_datap (so, pcrd, 8);
3299	}
3300
3301        so_ref(so, &nv50->state.programs);
3302        so_ref(NULL, &so);
3303}
3304
3305void
3306nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
3307{
3308	while (p->exec_head) {
3309		struct nv50_program_exec *e = p->exec_head;
3310
3311		p->exec_head = e->next;
3312		FREE(e);
3313	}
3314	p->exec_tail = NULL;
3315	p->exec_size = 0;
3316
3317	nouveau_bo_ref(NULL, &p->bo);
3318
3319	nouveau_resource_free(&p->data[0]);
3320
3321	p->translated = 0;
3322}
3323