/* nv50_program.c — revision 4d2551beb7b3f5ae9f47ee97e24556c5bcb905c8 */
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
34#define NV50_SU_MAX_TEMP 127
35#define NV50_SU_MAX_ADDR 4
36//#define NV50_PROGRAM_DUMP
37
38/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */
39
40/* ARL - gallium craps itself on progs/vp/arl.txt
41 *
42 * MSB - Like MAD, but MUL+SUB
43 * 	- Fuck it off, introduce a way to negate args for ops that
44 * 	  support it.
45 *
46 * Look into inlining IMMD for ops other than MOV (make it general?)
47 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
48 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
49 *
50 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
51 * case, if the emit_src() causes the inst to suddenly become long.
52 *
53 * Verify half-insns work where expected - and force disable them where they
54 * don't work - MUL has it forcibly disabled atm as it fixes POW..
55 *
56 * FUCK! watch dst==src vectors, can overwrite components that are needed.
57 * 	ie. SUB R0, R0.yzxw, R0
58 *
59 * Things to check with renouveau:
60 * 	FP attr/result assignment - how?
61 * 		attrib
62 * 			- 0x16bc maps vp output onto fp hpos
63 * 			- 0x16c0 maps vp output onto fp col0
64 * 		result
65 * 			- colr always 0-3
66 * 			- depr always 4
67 * 0x16bc->0x16e8 --> some binding between vp/fp regs
68 * 0x16b8 --> VP output count
69 *
70 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
71 * 	      "MOV rcol.x, fcol.y" = 0x00000004
72 * 0x19a8 --> as above but 0x00000100 and 0x00000000
73 * 	- 0x00100000 used when KIL used
74 * 0x196c --> as above but 0x00000011 and 0x00000000
75 *
76 * 0x1988 --> 0xXXNNNNNN
77 * 	- XX == FP high something
78 */
/* A source-level register: tracks which file it lives in, its TGSI index,
 * its hardware slot and any pending source modifiers.
 */
struct nv50_reg {
	enum {
		P_TEMP,   /* hw GPR, allocated from pc->r_temp[] */
		P_ATTR,   /* input attribute */
		P_RESULT, /* shader output */
		P_CONST,  /* constant buffer entry */
		P_IMMD,   /* immediate; value stored in pc->immd_buf */
		P_ADDR    /* address register $aX */
	} type;
	int index; /* TGSI register index, or -1 for compiler temporaries */

	int hw;  /* hardware register index, -1 until allocated */
	int mod; /* pending NV50_MOD_* source modifier flags */

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
96
/* source/result modifier flags for nv50_reg::mod */
#define NV50_MOD_NEG 1 /* negate source */
#define NV50_MOD_ABS 2 /* absolute value of source */
#define NV50_MOD_SAT 4 /* saturate result to [0,1] */
100
/* STACK: Conditionals and loops have to use the (per warp) stack.
 * Stack entries consist of an entry type (divergent path, join at),
 * a mask indicating the active threads of the warp, and an address.
 * MPs can store 12 stack entries internally, if we need more (and
 * we probably do), we have to create a stack buffer in VRAM.
 */
/* impose low limits for now */
#define NV50_MAX_COND_NESTING 4
#define NV50_MAX_LOOP_NESTING 3

/* Set the join-at bit (inst[1] bit 1) on the insn emitted by 'e'. */
#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2
112
/* Per-shader translation context: hardware register state, TGSI register
 * mappings, immediates, and control-flow bookkeeping while emitting code.
 */
struct nv50_pc {
	struct nv50_program *p;

	/* hw resources */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; /* current occupant of each GPR */
	struct nv50_reg r_addr[NV50_SU_MAX_ADDR];  /* state of the address regs */

	/* tgsi resources */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	uint32_t *immd_buf; /* raw immediate words, 4 per immediate vec4 */
	int immd_nr;
	struct nv50_reg **addr;
	int addr_nr;
	uint8_t addr_alloc; /* set bit indicates used for TGSI_FILE_ADDRESS */

	struct nv50_reg *temp_temp[16]; /* per-insn scratch temps, see temp_temp() */
	unsigned temp_temp_nr;

	/* broadcast and destination replacement regs */
	struct nv50_reg *r_brdc;
	struct nv50_reg *r_dst[4];

	struct nv50_reg reg_instances[16]; /* scratch copies, see reg_instance() */
	unsigned reg_instance_nr;

	unsigned interp_mode[32];
	/* perspective interpolation registers */
	struct nv50_reg *iv_p;
	struct nv50_reg *iv_c;

	/* insns to patch once the matching ENDIF/ENDLOOP address is known */
	struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING];
	struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING];
	struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING];
	int if_lvl, loop_lvl;
	unsigned loop_pos[NV50_MAX_LOOP_NESTING];

	/* current instruction and total number of insns */
	unsigned insn_cur;
	unsigned insn_nr;

	boolean allow32; /* may emit short (32-bit) instruction encodings */

	uint8_t edgeflag_out;
};
165
166static INLINE struct nv50_reg *
167reg_instance(struct nv50_pc *pc, struct nv50_reg *reg)
168{
169	struct nv50_reg *ri;
170
171	assert(pc->reg_instance_nr < 16);
172	ri = &pc->reg_instances[pc->reg_instance_nr++];
173	if (reg) {
174		*ri = *reg;
175		reg->mod = 0;
176	}
177	return ri;
178}
179
180static INLINE void
181ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
182{
183	reg->type = type;
184	reg->index = index;
185	reg->hw = hw;
186	reg->mod = 0;
187	reg->rhw = -1;
188	reg->acc = 0;
189}
190
191static INLINE unsigned
192popcnt4(uint32_t val)
193{
194	static const unsigned cnt[16]
195	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
196	return cnt[val & 0xf];
197}
198
199static void
200terminate_mbb(struct nv50_pc *pc)
201{
202	int i;
203
204	/* remove records of temporary address register values */
205	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
206		pc->r_addr[i].rhw = -1;
207}
208
/* Bind 'reg' to a hardware slot and keep the program header's high-water
 * marks (cfg.high_result / cfg.high_temp) up to date. Only P_TEMP regs
 * are actually allocated from pc->r_temp[].
 */
static void
alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
{
	int i = 0;

	if (reg->type == P_RESULT) {
		if (pc->p->cfg.high_result < (reg->hw + 1))
			pc->p->cfg.high_result = reg->hw + 1;
	}

	if (reg->type != P_TEMP)
		return;

	/* already allocated: only bump the high-water mark */
	if (reg->hw >= 0) {
		/*XXX: do this here too to catch FP temp-as-attr usage..
		 *     not clean, but works */
		if (pc->p->cfg.high_temp < (reg->hw + 1))
			pc->p->cfg.high_temp = reg->hw + 1;
		return;
	}

	if (reg->rhw != -1) {
		/* try to allocate temporary with index rhw first */
		if (!(pc->r_temp[reg->rhw])) {
			pc->r_temp[reg->rhw] = reg;
			reg->hw = reg->rhw;
			if (pc->p->cfg.high_temp < (reg->rhw + 1))
				pc->p->cfg.high_temp = reg->rhw + 1;
			return;
		}
		/* make sure we don't get things like $r0 needs to go
		 * in $r1 and $r1 in $r0
		 */
		i = pc->result_nr * 4;
	}

	/* first-fit scan for a free GPR */
	for (; i < NV50_SU_MAX_TEMP; i++) {
		if (!(pc->r_temp[i])) {
			pc->r_temp[i] = reg;
			reg->hw = i;
			if (pc->p->cfg.high_temp < (i + 1))
				pc->p->cfg.high_temp = i + 1;
			return;
		}
	}

	/* out of temporary registers */
	assert(0);
}
257
258/* XXX: For shaders that aren't executed linearly (e.g. shaders that
259 * contain loops), we need to assign all hw regs to TGSI TEMPs early,
260 * lest we risk temp_temps overwriting regs alloc'd "later".
261 */
262static struct nv50_reg *
263alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
264{
265	struct nv50_reg *r;
266	int i;
267
268	if (dst && dst->type == P_TEMP && dst->hw == -1)
269		return dst;
270
271	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
272		if (!pc->r_temp[i]) {
273			r = MALLOC_STRUCT(nv50_reg);
274			ctor_reg(r, P_TEMP, -1, i);
275			pc->r_temp[i] = r;
276			return r;
277		}
278	}
279
280	assert(0);
281	return NULL;
282}
283
284/* Assign the hw of the discarded temporary register src
285 * to the tgsi register dst and free src.
286 */
287static void
288assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
289{
290	assert(src->index == -1 && src->hw != -1);
291
292	if (dst->hw != -1)
293		pc->r_temp[dst->hw] = NULL;
294	pc->r_temp[src->hw] = dst;
295	dst->hw = src->hw;
296
297	FREE(src);
298}
299
300/* release the hardware resource held by r */
301static void
302release_hw(struct nv50_pc *pc, struct nv50_reg *r)
303{
304	assert(r->type == P_TEMP);
305	if (r->hw == -1)
306		return;
307
308	assert(pc->r_temp[r->hw] == r);
309	pc->r_temp[r->hw] = NULL;
310
311	r->acc = 0;
312	if (r->index == -1)
313		FREE(r);
314}
315
316static void
317free_temp(struct nv50_pc *pc, struct nv50_reg *r)
318{
319	if (r->index == -1) {
320		unsigned hw = r->hw;
321
322		FREE(pc->r_temp[hw]);
323		pc->r_temp[hw] = NULL;
324	}
325}
326
327static int
328alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
329{
330	int i;
331
332	if ((idx + 4) >= NV50_SU_MAX_TEMP)
333		return 1;
334
335	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
336	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
337		return alloc_temp4(pc, dst, idx + 4);
338
339	for (i = 0; i < 4; i++) {
340		dst[i] = MALLOC_STRUCT(nv50_reg);
341		ctor_reg(dst[i], P_TEMP, -1, idx + i);
342		pc->r_temp[idx + i] = dst[i];
343	}
344
345	return 0;
346}
347
/* Free all four temps of a vec4 allocated with alloc_temp4(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	int c;

	for (c = 0; c < 4; ++c)
		free_temp(pc, reg[c]);
}
356
357static struct nv50_reg *
358temp_temp(struct nv50_pc *pc)
359{
360	if (pc->temp_temp_nr >= 16)
361		assert(0);
362
363	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
364	return pc->temp_temp[pc->temp_temp_nr++];
365}
366
367static void
368kill_temp_temp(struct nv50_pc *pc)
369{
370	int i;
371
372	for (i = 0; i < pc->temp_temp_nr; i++)
373		free_temp(pc, pc->temp_temp[i]);
374	pc->temp_temp_nr = 0;
375}
376
377static int
378ctor_immd_4u32(struct nv50_pc *pc,
379	       uint32_t x, uint32_t y, uint32_t z, uint32_t w)
380{
381	unsigned size = pc->immd_nr * 4 * sizeof(uint32_t);
382
383	pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t));
384
385	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
386	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
387	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
388	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
389
390	return pc->immd_nr++;
391}
392
393static INLINE int
394ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w)
395{
396	return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w));
397}
398
399static struct nv50_reg *
400alloc_immd(struct nv50_pc *pc, float f)
401{
402	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
403	unsigned hw;
404
405	for (hw = 0; hw < pc->immd_nr * 4; hw++)
406		if (pc->immd_buf[hw] == fui(f))
407			break;
408
409	if (hw == pc->immd_nr * 4)
410		hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4;
411
412	ctor_reg(r, P_IMMD, -1, hw);
413	return r;
414}
415
416static struct nv50_program_exec *
417exec(struct nv50_pc *pc)
418{
419	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
420
421	e->param.index = -1;
422	return e;
423}
424
425static void
426emit(struct nv50_pc *pc, struct nv50_program_exec *e)
427{
428	struct nv50_program *p = pc->p;
429
430	if (p->exec_tail)
431		p->exec_tail->next = e;
432	if (!p->exec_head)
433		p->exec_head = e;
434	p->exec_tail = e;
435	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
436}
437
438static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
439
440static boolean
441is_long(struct nv50_program_exec *e)
442{
443	if (e->inst[0] & 1)
444		return TRUE;
445	return FALSE;
446}
447
448static boolean
449is_immd(struct nv50_program_exec *e)
450{
451	if (is_long(e) && (e->inst[1] & 3) == 3)
452		return TRUE;
453	return FALSE;
454}
455
456static INLINE void
457set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
458	 struct nv50_program_exec *e)
459{
460	set_long(pc, e);
461	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
462	e->inst[1] |= (pred << 7) | (idx << 12);
463}
464
465static INLINE void
466set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
467	    struct nv50_program_exec *e)
468{
469	set_long(pc, e);
470	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
471	e->inst[1] |= (idx << 4) | (on << 6);
472}
473
/* Promote e to the 64-bit (long) encoding and initialize the predicate
 * fields to "always execute, no predicate write".
 */
static INLINE void
set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	if (is_long(e))
		return;

	e->inst[0] |= 1;
	/* the recursion back into set_long from these is cut short by the
	 * long bit we just set */
	set_pred(pc, 0xf, 0, e);
	set_pred_wr(pc, 0, 0, e);
}
484
/* Encode dst as e's destination, allocating a hw reg if necessary. */
static INLINE void
set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
{
	/* writing a shader output needs the long form (out flag in inst[1]) */
	if (dst->type == P_RESULT) {
		set_long(pc, e);
		e->inst[1] |= 0x00000008;
	}

	alloc_reg(pc, dst);
	/* short insns can only encode $r0 - $r63 */
	if (dst->hw > 63)
		set_long(pc, e);
	e->inst[0] |= (dst->hw << 2);
}
498
/* Switch e to the immediate form and embed imm's 32-bit value in it. */
static INLINE void
set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
{
	set_long(pc, e);
	/* XXX: can't be predicated - bits overlap; cases where both
	 * are required should be avoided by using pc->allow32 */
	set_pred(pc, 0, 0, e);
	set_pred_wr(pc, 0, 0, e);

	/* immediate form marker: inst[1] low bits == 3 */
	e->inst[1] |= 0x00000002 | 0x00000001;
	/* the value is split: low 6 bits into inst[0], the rest into inst[1] */
	e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16;
	e->inst[1] |= (pc->immd_buf[imm->hw] >> 6) << 2;
}
512
/* Encode address reg a into e; the address reg fields must be clear. */
static INLINE void
set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
{
	assert(!(e->inst[0] & 0x0c000000));
	assert(!(e->inst[1] & 0x00000004));

	/* the $aX index is split across the two instruction words */
	e->inst[0] |= (a->hw & 3) << 26;
	e->inst[1] |= (a->hw >> 2) << 2;
}
522
/* Emit an address-ALU add: $a(dst) = src0 + src1_val (immediate). */
static void
emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
		  struct nv50_reg *src0, uint16_t src1_val)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xd0000000 | (src1_val << 9);
	e->inst[1] = 0x20000000;
	set_long(pc, e);
	e->inst[0] |= dst->hw << 2;
	if (src0) /* otherwise will add to $a0, which is always 0 */
		set_addr(e, src0);

	emit(pc, e);
}
538
/* Get an address reg through which 'ref' can be accessed, emitting a
 * load of the base offset if needed. With ref == NULL, reserve an
 * address reg for TGSI_FILE_ADDRESS use instead.
 */
static struct nv50_reg *
alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
{
	struct nv50_reg *a_tgsi = NULL, *a = NULL;
	int i;
	uint8_t avail = ~pc->addr_alloc;

	if (!ref) {
		/* allocate for TGSI_FILE_ADDRESS */
		while (avail) {
			i = ffs(avail) - 1;

			/* take a reg with no cached value, or one whose value
			 * is not still needed by the current instruction */
			if (pc->r_addr[i].rhw < 0 ||
			    pc->r_addr[i].acc != pc->insn_cur) {
				pc->addr_alloc |= (1 << i);

				pc->r_addr[i].rhw = -1;
				pc->r_addr[i].index = i;
				return &pc->r_addr[i];
			}
			avail &= ~(1 << i);
		}
		assert(0);
		return NULL;
	}

	/* Allocate and set an address reg so we can access 'ref'.
	 *
	 * For each r_addr[i]: index is -1, or the TGSI ADDR reg whose value
	 * it mirrors; rhw is the hw base offset the loaded value is relative
	 * to. If rhw < 0, the reg has not been initialized or is in use for
	 * TGSI_FILE_ADDRESS.
	 */
	while (avail) { /* only consider regs that are not TGSI */
		i = ffs(avail) - 1;
		avail &= ~(1 << i);

		if ((!a || a->rhw >= 0) && pc->r_addr[i].rhw < 0) {
			/* prefer an usused reg with low hw index */
			a = &pc->r_addr[i];
			continue;
		}
		if (!a && pc->r_addr[i].acc != pc->insn_cur)
			a = &pc->r_addr[i];

		/* cached base must be within 128 of ref's offset */
		if (ref->hw - pc->r_addr[i].rhw >= 128)
			continue;

		/* reuse: plain constant needs any non-TGSI reg, relative
		 * access needs the reg mirroring the right ADDR index */
		if ((ref->acc >= 0 && pc->r_addr[i].index < 0) ||
		    (ref->acc < 0 && pc->r_addr[i].index == ref->index)) {
			pc->r_addr[i].acc = pc->insn_cur;
			return &pc->r_addr[i];
		}
	}
	assert(a);

	if (ref->acc < 0)
		a_tgsi = pc->addr[ref->index];

	/* load the 128-aligned base offset into the chosen reg */
	emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4);

	a->rhw = ref->hw & ~0x7f;
	a->acc = pc->insn_cur;
	a->index = a_tgsi ? ref->index : -1;
	return a;
}
604
/* interpolation modes for fragment program inputs */
#define INTERP_LINEAR		0
#define INTERP_FLAT		1
#define INTERP_PERSPECTIVE	2
#define INTERP_CENTROID		4
609
610/* interpolant index has been stored in dst->rhw */
611static void
612emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
613		unsigned mode)
614{
615	assert(dst->rhw != -1);
616	struct nv50_program_exec *e = exec(pc);
617
618	e->inst[0] |= 0x80000000;
619	set_dst(pc, dst, e);
620	e->inst[0] |= (dst->rhw << 16);
621
622	if (mode & INTERP_FLAT) {
623		e->inst[0] |= (1 << 8);
624	} else {
625		if (mode & INTERP_PERSPECTIVE) {
626			e->inst[0] |= (1 << 25);
627			alloc_reg(pc, iv);
628			e->inst[0] |= (iv->hw << 9);
629		}
630
631		if (mode & INTERP_CENTROID)
632			e->inst[0] |= (1 << 24);
633	}
634
635	emit(pc, e);
636}
637
/* Attach a c[] / immediate data reference to e: the actual word is
 * patched into the code at upload time (mask m shifted to bit s).
 */
static void
set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
	 struct nv50_program_exec *e)
{
	set_long(pc, e);

	e->param.index = src->hw & 127;
	e->param.shift = s;
	e->param.mask = m << (s % 32);

	/* offsets >= 128 need an address reg to be reachable */
	if (src->hw > 127)
		set_addr(e, alloc_addr(pc, src));
	else
	if (src->acc < 0) {
		/* relative-addressed constant: use its TGSI ADDR reg */
		assert(src->type == P_CONST);
		set_addr(e, pc->addr[src->index]);
	}

	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
}
658
/* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! */
/* Emit dst = src for any combination of register files, choosing the
 * cheapest encoding (short immediate, c[] load, or reg-reg mov).
 */
static void
emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x10000000;
	if (!pc->allow32)
		set_long(pc, e);

	set_dst(pc, dst, e);

	if (!is_long(e) && src->type == P_IMMD) {
		set_immd(pc, src, e);
		/*XXX: 32-bit, but steals part of "half" reg space - need to
		 *     catch and handle this case if/when we do half-regs
		 */
	} else
	if (src->type == P_IMMD || src->type == P_CONST) {
		set_long(pc, e);
		set_data(pc, src, 0x7f, 9, e);
		e->inst[1] |= 0x20000000; /* mov from c[] */
	} else {
		if (src->type == P_ATTR) {
			set_long(pc, e);
			e->inst[1] |= 0x00200000; /* mov from a[] */
		}

		alloc_reg(pc, src);
		if (src->hw > 63)
			set_long(pc, e);
		e->inst[0] |= (src->hw << 9);
	}

	if (is_long(e) && !is_immd(e)) {
		e->inst[1] |= 0x04000000; /* 32-bit */
		e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */
		if (!(e->inst[1] & 0x20000000))
			e->inst[1] |= 0x00030000; /* lane mask 2:3 */
	} else
		e->inst[0] |= 0x00008000;

	emit(pc, e);
}
703
704static INLINE void
705emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
706{
707	struct nv50_reg *imm = alloc_immd(pc, f);
708	emit_mov(pc, dst, imm);
709	FREE(imm);
710}
711
/* Emit a long nop (0xf0000000 / 0xe0000000). */
static void
emit_nop(struct nv50_pc *pc)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xf0000000;
	set_long(pc, e);
	/* NOTE: deliberately overwrites the predicate fields set_long just
	 * initialized; this opcode carries none */
	e->inst[1] = 0xe0000000;
	emit(pc, e);
}
722
723static boolean
724check_swap_src_0_1(struct nv50_pc *pc,
725		   struct nv50_reg **s0, struct nv50_reg **s1)
726{
727	struct nv50_reg *src0 = *s0, *src1 = *s1;
728
729	if (src0->type == P_CONST) {
730		if (src1->type != P_CONST) {
731			*s0 = src1;
732			*s1 = src0;
733			return TRUE;
734		}
735	} else
736	if (src1->type == P_ATTR) {
737		if (src0->type != P_ATTR) {
738			*s0 = src1;
739			*s1 = src0;
740			return TRUE;
741		}
742	}
743
744	return FALSE;
745}
746
747static void
748set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
749		     struct nv50_program_exec *e)
750{
751	struct nv50_reg *temp;
752
753	if (src->type != P_TEMP) {
754		temp = temp_temp(pc);
755		emit_mov(pc, temp, src);
756		src = temp;
757	}
758
759	alloc_reg(pc, src);
760	if (src->hw > 63)
761		set_long(pc, e);
762	e->inst[0] |= (src->hw << 9);
763}
764
765static void
766set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
767{
768	if (src->type == P_ATTR) {
769		set_long(pc, e);
770		e->inst[1] |= 0x00200000;
771	} else
772	if (src->type == P_CONST || src->type == P_IMMD) {
773		struct nv50_reg *temp = temp_temp(pc);
774
775		emit_mov(pc, temp, src);
776		src = temp;
777	}
778
779	alloc_reg(pc, src);
780	if (src->hw > 63)
781		set_long(pc, e);
782	e->inst[0] |= (src->hw << 9);
783}
784
/* Encode src1. c[]/immediates use the insn's single shared data slot
 * (claimed with bit 23); if src2 already took it (bit 24), fall back
 * to loading through a scratch temp.
 */
static void
set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		assert(!(e->inst[0] & 0x00800000));
		if (e->inst[0] & 0x01000000) {
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 16, e);
			e->inst[0] |= 0x00800000;
		}
	}

	alloc_reg(pc, src);
	if (src->hw > 63)
		set_long(pc, e);
	e->inst[0] |= ((src->hw & 127) << 16);
}
812
/* Encode src2 (always requires the long form). Mirror of set_src_1:
 * the shared data slot is claimed with bit 24, and yielded to src1
 * (bit 23) by going through a scratch temp.
 */
static void
set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	set_long(pc, e);

	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		assert(!(e->inst[0] & 0x01000000));
		if (e->inst[0] & 0x00800000) {
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 32+14, e);
			e->inst[0] |= 0x01000000;
		}
	}

	alloc_reg(pc, src);
	e->inst[1] |= ((src->hw & 127) << 14);
}
840
/* Copy the value of predicate register 'pred' into the GPR dst. */
static void
emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred)
{
	struct nv50_program_exec *e = exec(pc);

	assert(dst->type == P_TEMP);
	e->inst[1] = 0x20000000 | (pred << 12);
	set_long(pc, e);
	set_dst(pc, dst, e);

	emit(pc, e);
}
853
/* Load src into predicate register 'pred' (no GPR destination). */
static void
emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x000001fc; /* null destination */
	e->inst[1] = 0xa0000008;
	set_long(pc, e);
	set_pred_wr(pc, 1, pred, e);
	set_src_0_restricted(pc, src, e);

	emit(pc, e);
}
867
/* Emit dst = src0 * src1, honouring NV50_MOD_NEG on the sources. */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xc0000000;

	if (!pc->allow32)
		set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_IMMD && !is_long(e)) {
		/* result negate bit of the short immediate form */
		if (src0->mod ^ src1->mod)
			e->inst[0] |= 0x00008000;
		set_immd(pc, src1, e);
	} else {
		set_src_1(pc, src1, e);
		/* a single negate on either operand negates the product */
		if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) {
			if (is_long(e))
				e->inst[1] |= 0x08000000;
			else
				e->inst[0] |= 0x00008000;
		}
	}

	emit(pc, e);
}
898
/* Emit dst = src0 + src1, honouring NV50_MOD_NEG on the sources. */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xb0000000;

	/* allocate src1 up front so the hw-index check below is valid */
	alloc_reg(pc, src1);
	check_swap_src_0_1(pc, &src0, &src1);

	if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) {
		set_long(pc, e);
		e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) |
			      ((src1->mod & NV50_MOD_NEG) << 27);
	}

	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
		set_src_2(pc, src1, e);
	else
	if (src1->type == P_IMMD)
		set_immd(pc, src1, e);
	else
		set_src_1(pc, src1, e);

	emit(pc, e);
}
928
/* Load an address register: $a(dst) = src << s (used for TGSI ARL). */
static void
emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 uint8_t s)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[1] |= 0xc0000000;

	/* dst->hw is used directly; $a regs are not run through alloc_reg */
	e->inst[0] |= dst->hw << 2;
	e->inst[0] |= s << 16; /* shift left */
	set_src_0_restricted(pc, src, e);

	emit(pc, e);
}
944
/* Emit a min/max; 'sub' selects the operation (4 == max, 5 == min,
 * as used by emit_lit). NV50_MOD_ABS is honoured on both sources.
 */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= (sub << 29);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	if (src0->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00100000;
	if (src1->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00080000;

	emit(pc, e);
}
967
/* Emit dst = src0 - src1 by temporarily negating src1 around emit_add;
 * src1->mod is restored afterwards.
 */
static INLINE void
emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	src1->mod ^= NV50_MOD_NEG;
	emit_add(pc, dst, src0, src1);
	src1->mod ^= NV50_MOD_NEG;
}
976
/* Emit a two-source bitwise op; 'op' is the TGSI opcode (AND/OR/XOR). */
static void
emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	    struct nv50_reg *src1, unsigned op)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xd0000000;
	set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);

	if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR &&
	    op != TGSI_OPCODE_XOR)
		assert(!"invalid bit op");

	/* source modifiers make no sense on integer ops */
	assert(!(src0->mod | src1->mod));

	if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
		/* short immediate form: op selected in inst[0] */
		set_immd(pc, src1, e);
		if (op == TGSI_OPCODE_OR)
			e->inst[0] |= 0x0100;
		else
		if (op == TGSI_OPCODE_XOR)
			e->inst[0] |= 0x8000;
	} else {
		/* long form: op selected in inst[1] */
		set_src_1(pc, src1, e);
		e->inst[1] |= 0x04000000; /* 32 bit */
		if (op == TGSI_OPCODE_OR)
			e->inst[1] |= 0x4000;
		else
		if (op == TGSI_OPCODE_XOR)
			e->inst[1] |= 0x8000;
	}

	emit(pc, e);
}
1015
/* Emit dst = src0 * src1 + src2, honouring NV50_MOD_NEG on all sources. */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xe0000000;

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);
	set_src_2(pc, src2, e);

	/* a single negate on either mul operand negates the product */
	if ((src0->mod ^ src1->mod) & NV50_MOD_NEG)
		e->inst[1] |= 0x04000000;
	if (src2->mod & NV50_MOD_NEG)
		e->inst[1] |= 0x08000000;

	emit(pc, e);
}
1037
/* MSB (MUL + SUB): dst = src0 * src1 - src2, by temporarily negating
 * src2 around emit_mad; src2->mod is restored afterwards.
 */
static INLINE void
emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	src2->mod ^= NV50_MOD_NEG;
	emit_mad(pc, dst, src0, src1, src2);
	src2->mod ^= NV50_MOD_NEG;
}
1046
/* sub-opcodes for the 0x90000000 scalar transcendental instruction */
#define NV50_FLOP_RCP 0
#define NV50_FLOP_RSQ 2
#define NV50_FLOP_LG2 3
#define NV50_FLOP_SIN 4
#define NV50_FLOP_COS 5
#define NV50_FLOP_EX2 6
1053
/* rcp, rsqrt, lg2 support neg and abs */
/* Emit a scalar transcendental op (NV50_FLOP_*); the short form is only
 * usable for rcp without source modifiers.
 */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x90000000;
	if (sub || src->mod) {
		set_long(pc, e);
		e->inst[1] |= (sub << 29);
	}

	set_dst(pc, dst, e);
	set_src_0_restricted(pc, src, e);

	/* only rcp/rsq/lg2 (sub < 4) accept modifiers, see comment above */
	assert(!src->mod || sub < 4);

	if (src->mod & NV50_MOD_NEG)
		e->inst[1] |= 0x04000000;
	if (src->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00100000;

	emit(pc, e);
}
1079
/* Pre-process the operand for EX2 (sub-op 6 of 0xb0000000 with flag
 * 0x4000); honours NV50_MOD_NEG/ABS on src.
 */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29) | 0x00004000;

	if (src->mod & NV50_MOD_NEG)
		e->inst[1] |= 0x04000000;
	if (src->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00100000;

	emit(pc, e);
}
1099
/* Pre-process (reduce) the operand for SIN/COS (sub-op 6 of 0xb0000000);
 * honours NV50_MOD_NEG/ABS on src.
 */
static void
emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29);

	if (src->mod & NV50_MOD_NEG)
		e->inst[1] |= 0x04000000;
	if (src->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00100000;

	emit(pc, e);
}
1119
/* operation / rounding modes for the cvt instruction (may be OR'd) */
#define CVTOP_RN	0x01
#define CVTOP_FLOOR	0x03
#define CVTOP_CEIL	0x05
#define CVTOP_TRUNC	0x07
#define CVTOP_SAT	0x08
#define CVTOP_ABS	0x10

/* cvt source/destination formats: */
/* 0x04 == 32 bit dst */
/* 0x40 == dst is float */
/* 0x80 == src is float */
#define CVT_F32_F32 0xc4
#define CVT_F32_S32 0x44
#define CVT_S32_F32 0x8c
#define CVT_S32_S32 0x0c
#define CVT_NEG     0x20
#define CVT_RI      0x08
1136
/* Emit a cvt instruction: cvn = CVTOP_* operation flags, fmt = CVT_*
 * format. wp >= 0 also writes predicate reg wp; dst == NULL discards
 * the result (predicate-only use).
 */
static void
emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 int wp, unsigned cvn, unsigned fmt)
{
	struct nv50_program_exec *e;

	e = exec(pc);
	set_long(pc, e);

	e->inst[0] |= 0xa0000000;
	e->inst[1] |= 0x00004000; /* 32 bit src */
	e->inst[1] |= (cvn << 16);
	e->inst[1] |= (fmt << 24);
	set_src_0(pc, src, e);

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);

	if (dst)
		set_dst(pc, dst, e);
	else {
		/* no destination: discard the result */
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	emit(pc, e);
}
1164
/* nv50 Condition codes:
 *  0x1 = LT
 *  0x2 = EQ
 *  0x3 = LE
 *  0x4 = GT
 *  0x5 = NE
 *  0x6 = GE
 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
 *  0x8 = unordered bit (allows NaN)
 */
/* Emit a compare: dst = (src0 <ccode> src1) ? 1.0 : 0.0; wp >= 0 also
 * writes predicate reg wp, dst == NULL writes only the predicate.
 */
static void
emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	/* condition code to use if the operands get swapped (LT<->GT etc.) */
	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };

	struct nv50_program_exec *e = exec(pc);
	struct nv50_reg *rdst;

	assert(ccode < 16);
	if (check_swap_src_0_1(pc, &src0, &src1))
		ccode = cc_swapped[ccode & 7] | (ccode & 8);

	/* set.u32 produces an integer result: compute into a temp and
	 * convert into the real destination afterwards */
	rdst = dst;
	if (dst && dst->type != P_TEMP)
		dst = alloc_temp(pc, NULL);

	/* set.u32 */
	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= 0x60000000 | (ccode << 14);

	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
	 * that doesn't seem to match what the hw actually does
	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
	 */

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);
	if (dst)
		set_dst(pc, dst, e);
	else {
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	emit(pc, e);

	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
	if (rdst)
		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
	if (rdst && rdst != dst)
		free_temp(pc, dst);
}
1222
1223static INLINE unsigned
1224map_tgsi_setop_cc(unsigned op)
1225{
1226	switch (op) {
1227	case TGSI_OPCODE_SLT: return 0x1;
1228	case TGSI_OPCODE_SGE: return 0x6;
1229	case TGSI_OPCODE_SEQ: return 0x2;
1230	case TGSI_OPCODE_SGT: return 0x4;
1231	case TGSI_OPCODE_SLE: return 0x3;
1232	case TGSI_OPCODE_SNE: return 0xd;
1233	default:
1234		assert(0);
1235		return 0;
1236	}
1237}
1238
/* dst = floor(src) */
static INLINE void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
}
1244
1245static void
1246emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
1247	 struct nv50_reg *v, struct nv50_reg *e)
1248{
1249	struct nv50_reg *temp = alloc_temp(pc, NULL);
1250
1251	emit_flop(pc, NV50_FLOP_LG2, temp, v);
1252	emit_mul(pc, temp, temp, e);
1253	emit_preex2(pc, temp, temp);
1254	emit_flop(pc, NV50_FLOP_EX2, dst, temp);
1255
1256	free_temp(pc, temp);
1257}
1258
/* dst = abs(src) */
static INLINE void
emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
}
1264
/* dst = src clamped to [0.0, 1.0] (saturate) */
static INLINE void
emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
}
1270
/* Emit code for the TGSI LIT instruction:
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0.0)
 *   dst.z = (src.x > 0) ? pow(max(src.y, 0), clamp(src.w)) : 0.0
 *   dst.w = 1.0
 * Only components selected in mask are written.
 */
static void
emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0);
	struct nv50_reg *zero = alloc_immd(pc, 0.0);
	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
	struct nv50_reg *tmp[4];
	boolean allow32 = pc->allow32;

	/* forced off: the predicate-writing/predicated ops below need
	 * the long instruction format
	 */
	pc->allow32 = FALSE;

	/* max(src.x, 0) is needed for both the y and z results */
	if (mask & (3 << 1)) {
		tmp[0] = alloc_temp(pc, NULL);
		emit_minmax(pc, 4, tmp[0], src[0], zero);
	}

	if (mask & (1 << 2)) {
		/* make the MAX above also write condition flags to $p0,
		 * so we can test src.x <= 0 afterwards
		 */
		set_pred_wr(pc, 1, 0, pc->p->exec_tail);

		tmp[1] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[1], src[1], zero);

		/* clamp the exponent to +/-127.999999 (4 = max, 5 = min) */
		tmp[3] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[3], src[3], neg128);
		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);

		emit_pow(pc, dst[2], tmp[1], tmp[3]);
		/* overwrite the pow result with 0 where src.x <= 0
		 * (cc 3 = LE on $p0, see map_tgsi_setop_cc)
		 */
		emit_mov(pc, dst[2], zero);
		set_pred(pc, 3, 0, pc->p->exec_tail);
	}

	/* hand tmp[0] over as dst.y, or release it if only z used it */
	if (mask & (1 << 1))
		assimilate_temp(pc, dst[1], tmp[0]);
	else
	if (mask & (1 << 2))
		free_temp(pc, tmp[0]);

	pc->allow32 = allow32;

	/* do this last, in case src[i,j] == dst[0,3] */
	if (mask & (1 << 0))
		emit_mov(pc, dst[0], one);

	if (mask & (1 << 3))
		emit_mov(pc, dst[3], one);

	FREE(pos128);
	FREE(neg128);
	FREE(zero);
	FREE(one);
}
1324
/* dst = -src */
static INLINE void
emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
}
1330
/* Emit a fragment discard.
 * With src != NULL the discard is conditional: src is converted with
 * its result flags written to predicate reg $p1, and the discard only
 * executes where src < 0 (cc = LT). src == NULL (KILP) discards
 * unconditionally.
 */
static void
emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
{
	struct nv50_program_exec *e;
	const int r_pred = 1;

	e = exec(pc);
	e->inst[0] = 0x00000002; /* discard */
	set_long(pc, e); /* sets cond code to ALWAYS */

	if (src) {
		unsigned cvn = CVT_F32_F32;

		set_pred(pc, 0x1 /* cc = LT */, r_pred, e);

		/* fold a NEG source modifier into the conversion */
		if (src->mod & NV50_MOD_NEG)
			cvn |= CVT_NEG;
		/* write predicate reg */
		emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);
	}

	emit(pc, e);
}
1354
/* Emit a BREAKADDR instruction, recording where a later BRK branches
 * to. Returns the exec so the caller can patch the target address into
 * param.index once it is known (see BGNLOOP/ENDLOOP handling).
 */
static struct nv50_program_exec *
emit_breakaddr(struct nv50_pc *pc)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x40000002;
	set_long(pc, e);

	emit(pc, e);
	return e;
}
1366
/* Emit BRK (break out of the current loop), optionally predicated on
 * condition code cc of predicate reg pred (pred < 0: unconditional).
 */
static void
emit_break(struct nv50_pc *pc, int pred, unsigned cc)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x50000002;
	set_long(pc, e);
	if (pred >= 0)
		set_pred(pc, cc, pred, e);

	emit(pc, e);
}
1379
/* Emit JOINAT (set the address where diverged threads rejoin).
 * Returns the exec so the caller can patch param.index with the join
 * target later.
 */
static struct nv50_program_exec *
emit_joinat(struct nv50_pc *pc)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xa0000002;
	set_long(pc, e);

	emit(pc, e);
	return e;
}
1391
/* Emit BRA, optionally predicated on (cc, $p<pred>); pred < 0 makes it
 * unconditional. Returns the emitted instruction (exec_tail) so the
 * caller can fill in the branch target via param.index.
 */
static struct nv50_program_exec *
emit_branch(struct nv50_pc *pc, int pred, unsigned cc)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x10000002;
	set_long(pc, e);
	if (pred >= 0)
		set_pred(pc, cc, pred, e);
	emit(pc, e);
	return pc->p->exec_tail;
}
1404
/* Emit RET, optionally predicated on (cc, $p<pred>); pred < 0 makes it
 * unconditional.
 */
static void
emit_ret(struct nv50_pc *pc, int pred, unsigned cc)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x30000002;
	set_long(pc, e);
	if (pred >= 0)
		set_pred(pc, cc, pred, e);

	emit(pc, e);
}
1417
1418#define QOP_ADD 0
1419#define QOP_SUBR 1
1420#define QOP_SUB 2
1421#define QOP_MOV_SRC1 3
1422
1423/* For a quad of threads / top left, top right, bottom left, bottom right
1424 * pixels, do a different operation, and take src0 from a specific thread.
1425 */
1426static void
1427emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0,
1428	    struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop)
1429{
1430       struct nv50_program_exec *e = exec(pc);
1431
1432       e->inst[0] = 0xc0000000;
1433       e->inst[1] = 0x80000000;
1434       set_long(pc, e);
1435       e->inst[0] |= lane_src0 << 16;
1436       set_src_0(pc, src0, e);
1437       set_src_2(pc, src1, e);
1438
1439       if (wp >= 0)
1440	       set_pred_wr(pc, 1, wp, e);
1441
1442       if (dst)
1443	       set_dst(pc, dst, e);
1444       else {
1445	       e->inst[0] |= 0x000001fc;
1446	       e->inst[1] |= 0x00000008;
1447       }
1448
1449       e->inst[0] |= (qop & 3) << 20;
1450       e->inst[1] |= (qop >> 2) << 22;
1451
1452       emit(pc, e);
1453}
1454
/* Prepare cube texture coordinates in t[0..2]: divide s, t, r by the
 * component with the largest magnitude, so the major axis ends up at
 * +/-1.0. For shadow cube maps (arg == 4) the reference value is moved
 * to t[3].
 */
static void
load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
		     struct nv50_reg **src, unsigned arg, boolean proj)
{
	/* save modifiers; we force ABS on temporarily to find the
	 * largest magnitude via MAX
	 */
	int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod };

	src[0]->mod |= NV50_MOD_ABS;
	src[1]->mod |= NV50_MOD_ABS;
	src[2]->mod |= NV50_MOD_ABS;

	/* t[2] = max(|s|, |t|, |r|) */
	emit_minmax(pc, 4, t[2], src[0], src[1]);
	emit_minmax(pc, 4, t[2], src[2], t[2]);

	src[0]->mod = mod[0];
	src[1]->mod = mod[1];
	src[2]->mod = mod[2];

	if (proj && 0 /* looks more correct without this */)
		emit_mul(pc, t[2], t[2], src[3]);
	else
	if (arg == 4) /* there is no textureProj(samplerCubeShadow) */
		emit_mov(pc, t[3], src[3]);

	/* t[2] = 1 / max(...), then scale all three coordinates */
	emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]);

	emit_mul(pc, t[0], src[0], t[2]);
	emit_mul(pc, t[1], src[1], t[2]);
	emit_mul(pc, t[2], src[2], t[2]);
}
1484
/* Load projective texture coordinates into t[]: each coordinate (and,
 * if arg != dim, the depth reference value) divided by src.w.
 * If the sources look like interpolated FP inputs (P_TEMP with a valid
 * rhw), re-interpolate them perspective-correct with 1/w folded in;
 * otherwise do an explicit RCP + per-component MUL.
 */
static void
load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
		     struct nv50_reg **src, unsigned dim, unsigned arg)
{
	unsigned c, mode;

	if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
		mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE;

		/* t[3] = 1 / interpolated w */
		t[3]->rhw = src[3]->rhw;
		emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
		emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]);

		for (c = 0; c < dim; ++c) {
			t[c]->rhw = src[c]->rhw;
			emit_interp(pc, t[c], t[3], mode);
		}
		if (arg != dim) { /* depth reference value */
			t[dim]->rhw = src[2]->rhw;
			emit_interp(pc, t[dim], t[3], mode);
		}
	} else {
		/* XXX: for some reason the blob sometimes uses MAD
		 * (mad f32 $rX $rY $rZ neg $r63)
		 */
		emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]);
		for (c = 0; c < dim; ++c)
			emit_mul(pc, t[c], src[c], t[3]);
		if (arg != dim) /* depth reference value */
			emit_mul(pc, t[dim], src[2], t[3]);
	}
}
1517
1518static INLINE void
1519get_tex_dim(unsigned type, unsigned *dim, unsigned *arg)
1520{
1521	switch (type) {
1522	case TGSI_TEXTURE_1D:
1523		*arg = *dim = 1;
1524		break;
1525	case TGSI_TEXTURE_SHADOW1D:
1526		*dim = 1;
1527		*arg = 2;
1528		break;
1529	case TGSI_TEXTURE_UNKNOWN:
1530	case TGSI_TEXTURE_2D:
1531	case TGSI_TEXTURE_RECT:
1532		*arg = *dim = 2;
1533		break;
1534	case TGSI_TEXTURE_SHADOW2D:
1535	case TGSI_TEXTURE_SHADOWRECT:
1536		*dim = 2;
1537		*arg = 3;
1538		break;
1539	case TGSI_TEXTURE_3D:
1540	case TGSI_TEXTURE_CUBE:
1541		*dim = *arg = 3;
1542		break;
1543	default:
1544		assert(0);
1545		break;
1546	}
1547}
1548
/* We shouldn't execute TEXLOD if any of the pixels in a quad have
 * different LOD values, so branch off groups of equal LOD.
 * NOTE(review): target is an exec_size word offset; the "7 * 2" count
 * assumes a fixed number of instruction words emitted before the
 * target point — verify if any of the emits below change.
 */
static void
emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod,
		     struct nv50_reg *src, struct nv50_program_exec *tex)
{
	struct nv50_program_exec *join_at;
	unsigned i, target = pc->p->exec_size + 7 * 2;

	/* Subtract lod of each pixel from lod of top left pixel, jump
	 * texlod insn if result is 0, then repeat for 2 other pixels.
	 */
	join_at = emit_joinat(pc);
	emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55);
	emit_branch(pc, 0, 2)->param.index = target;

	for (i = 1; i < 4; ++i) {
		emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55);
		emit_branch(pc, 0, 2)->param.index = target;
	}

	emit_mov(pc, tlod, src); /* target */
	emit(pc, tex); /* texlod */

	join_at->param.index = target + 2 * 2;
	JOIN_ON(emit_nop(pc)); /* join _after_ tex */
}
1577
/* TXB with per-pixel bias: group the 4 pixels of a quad by equal bias
 * value and run TEX once per group, so the implicitly computed
 * derivatives stay correct within each group.
 */
static void
emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg,
		      struct nv50_program_exec *tex)
{
	struct nv50_program_exec *e;
	struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL);
	int r_pred = 0;
	unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 };

	pc->allow32 = FALSE;
	/* imm_1248 addresses an immediate vec4 (1, 2, 4, 8); bumping
	 * imm_1248.hw below steps through its components
	 */
	ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4);

	/* Subtract bias value of thread i from bias values of each thread,
	 * store result in r_pred, and set bit i in r_bits if result was 0.
	 */
	assert(arg < 4);
	for (i = 0; i < 4; ++i, ++imm_1248.hw) {
		emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55);
		emit_mov(pc, r_bits, &imm_1248);
		set_pred(pc, 2, r_pred, pc->p->exec_tail);
	}
	emit_mov_to_pred(pc, r_pred, r_bits);

	/* The lanes of a quad are now grouped by the bit in r_pred they have
	 * set. Put the input values for TEX into a new register set for each
	 * group and execute TEX only for a specific group.
	 * We cannot use the same register set for each group because we need
	 * the derivatives, which are implicitly calculated, to be correct.
	 */
	for (i = 1; i < 4; ++i) {
		alloc_temp4(pc, t123[i], 0);

		for (c = 0; c <= arg; ++c)
			emit_mov(pc, t123[i][c], t[c]);

		/* copy of the TEX insn, retargeted at this group's regs */
		*(e = exec(pc)) = *(tex);
		e->inst[0] &= ~0x01fc;
		set_dst(pc, t123[i][0], e);
		set_pred(pc, cc[i], r_pred, e);
		emit(pc, e);
	}
	/* finally TEX on the original regs (where we kept the input) */
	set_pred(pc, cc[0], r_pred, tex);
	emit(pc, tex);

	/* put the 3 * n other results into regs for lane 0 */
	n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc));
	for (i = 1; i < 4; ++i) {
		for (c = 0; c < n; ++c) {
			emit_mov(pc, t[c], t123[i][c]);
			set_pred(pc, cc[i], r_pred, pc->p->exec_tail);
		}
		free_temp4(pc, t123[i]);
	}

	emit_nop(pc);
	free_temp(pc, r_bits);
}
1636
/* Emit a texture fetch (TEX/TXP/TXB/TXL).
 * dst/src are per-component; mask selects dst components. bias_lod:
 * 0 = plain TEX, < 0 = bias (TXB), > 0 = explicit LOD (TXL).
 */
static void
emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src, unsigned unit, unsigned type,
	 boolean proj, int bias_lod)
{
	struct nv50_reg *t[4];
	struct nv50_program_exec *e;
	unsigned c, dim, arg;

	/* t[i] must be within a single 128 bit super-reg */
	alloc_temp4(pc, t, 0);

	e = exec(pc);
	e->inst[0] = 0xf0000000;
	set_long(pc, e);
	set_dst(pc, t[0], e);

	/* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */
	e->inst[0] |= (unit << 9) /* | (unit << 17) */;

	/* live flag (don't set if TEX results affect input to another TEX): */
	/* e->inst[0] |= 0x00000004; */

	get_tex_dim(type, &dim, &arg);

	/* stage the coordinates (and depth ref) into t[] */
	if (type == TGSI_TEXTURE_CUBE) {
		e->inst[0] |= 0x08000000;
		load_cube_tex_coords(pc, t, src, arg, proj);
	} else
	if (proj)
		load_proj_tex_coords(pc, t, src, dim, arg);
	else {
		for (c = 0; c < dim; c++)
			emit_mov(pc, t[c], src[c]);
		if (arg != dim) /* depth reference value (always src.z here) */
			emit_mov(pc, t[dim], src[2]);
	}

	/* component write mask, split across the two opcode words */
	e->inst[0] |= (mask & 0x3) << 25;
	e->inst[1] |= (mask & 0xc) << 12;

	if (!bias_lod) {
		e->inst[0] |= (arg - 1) << 22;
		emit(pc, e);
	} else
	if (bias_lod < 0) {
		/* extra bias argument in t[arg] */
		e->inst[0] |= arg << 22;
		e->inst[1] |= 0x20000000; /* texbias */
		emit_mov(pc, t[arg], src[3]);
		emit_texbias_sequence(pc, t, arg, e);
	} else {
		/* extra lod argument in t[arg] */
		e->inst[0] |= arg << 22;
		e->inst[1] |= 0x40000000; /* texlod */
		emit_mov(pc, t[arg], src[3]);
		emit_texlod_sequence(pc, t[arg], src[3], e);
	}

#if 1
	c = 0;
	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
	if (mask & 8) emit_mov(pc, dst[3], t[c]);

	free_temp4(pc, t);
#else
	/* XXX: if p.e. MUL is used directly after TEX, it would still use
	 * the texture coordinates, not the fetched values: latency ? */

	for (c = 0; c < 4; c++) {
		if (mask & (1 << c))
			assimilate_temp(pc, dst[c], t[c]);
		else
			free_temp(pc, t[c]);
	}
#endif
}
1714
/* dst = d(src)/dx, computed as a quad-local horizontal difference.
 * A NEG modifier on src is folded into the opcode words.
 */
static void
emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP);

	e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0240000 : 0xc0140000;
	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_src_2(pc, src, e);

	emit(pc, e);
}
1731
/* dst = d(src)/dy, computed as a quad-local vertical difference.
 * A NEG modifier on src is folded into the opcode words.
 */
static void
emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP);

	e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000;
	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x85800000 : 0x8a400000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_src_2(pc, src, e);

	emit(pc, e);
}
1748
/* Convert a short (32 bit) instruction into its long (64 bit) form.
 * Per opcode family: m masks the inst[0] bits whose position differs
 * between the two encodings, q holds those fields re-encoded for
 * inst[1].
 */
static void
convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	unsigned q = 0, m = ~0;

	assert(!is_long(e));

	switch (e->inst[0] >> 28) {
	case 0x1:
		/* MOV */
		q = 0x0403c000;
		m = 0xffff7fff;
		break;
	case 0x8:
		/* INTERP (move centroid, perspective and flat bits) */
		m = ~0x03000100;
		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
		break;
	case 0x9:
		/* RCP */
		break;
	case 0xB:
		/* ADD */
		m = ~(127 << 16);
		q = ((e->inst[0] & (~m)) >> 2);
		break;
	case 0xC:
		/* MUL */
		m = ~0x00008000;
		q = ((e->inst[0] & (~m)) << 12);
		break;
	case 0xE:
		/* MAD (if src2 == dst) */
		q = ((e->inst[0] & 0x1fc) << 12);
		break;
	default:
		assert(0);
		break;
	}

	set_long(pc, e);
	pc->p->exec_size++; /* long insns occupy two 32 bit words */

	e->inst[0] &= m;
	e->inst[1] |= q;
}
1796
1797/* Some operations support an optional negation flag. */
1798static boolean
1799negate_supported(const struct tgsi_full_instruction *insn, int i)
1800{
1801	switch (insn->Instruction.Opcode) {
1802	case TGSI_OPCODE_ADD:
1803	case TGSI_OPCODE_COS:
1804	case TGSI_OPCODE_DDX:
1805	case TGSI_OPCODE_DDY:
1806	case TGSI_OPCODE_DP3:
1807	case TGSI_OPCODE_DP4:
1808	case TGSI_OPCODE_EX2:
1809	case TGSI_OPCODE_KIL:
1810	case TGSI_OPCODE_LG2:
1811	case TGSI_OPCODE_MAD:
1812	case TGSI_OPCODE_MUL:
1813	case TGSI_OPCODE_POW:
1814	case TGSI_OPCODE_RCP:
1815	case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */
1816	case TGSI_OPCODE_SCS:
1817	case TGSI_OPCODE_SIN:
1818	case TGSI_OPCODE_SUB:
1819		return TRUE;
1820	default:
1821		return FALSE;
1822	}
1823}
1824
/* Return a read mask for source register c deduced from the opcode and
 * the destination write mask (which components of the source are
 * actually consumed).
 */
static unsigned
nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
{
	unsigned x, mask = insn->Dst[0].Register.WriteMask;

	switch (insn->Instruction.Opcode) {
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_SIN:
		/* src.x feeds dst.xyz, src.w feeds dst.w (see emit path) */
		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
	case TGSI_OPCODE_DP3:
		return 0x7;
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_DPH:
	case TGSI_OPCODE_KIL: /* WriteMask ignored */
		return 0xf;
	case TGSI_OPCODE_DST:
		/* src0 supplies y,z; src1 supplies y,w */
		return mask & (c ? 0xa : 0x6);
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SCS:
		/* scalar ops: only src.x is read */
		return 0x1;
	case TGSI_OPCODE_IF:
		return 0x1;
	case TGSI_OPCODE_LIT:
		return 0xb;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXP:
	{
		const struct tgsi_instruction_texture *tex;

		assert(insn->Instruction.Texture);
		tex = &insn->Texture;

		mask = 0x7;
		if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
		    insn->Instruction.Opcode != TGSI_OPCODE_TXD)
			mask |= 0x8; /* bias, lod or proj */

		/* strip coordinate components the target doesn't use */
		switch (tex->Texture) {
		case TGSI_TEXTURE_1D:
			mask &= 0x9;
			break;
		case TGSI_TEXTURE_SHADOW1D:
			mask &= 0x5;
			break;
		case TGSI_TEXTURE_2D:
			mask &= 0xb;
			break;
		default:
			break;
		}
	}
		return mask;
	case TGSI_OPCODE_XPD:
		/* cross product: each dst component reads two src comps */
		x = 0;
		if (mask & 1) x |= 0x6;
		if (mask & 2) x |= 0x5;
		if (mask & 4) x |= 0x3;
		return x;
	default:
		break;
	}

	return mask;
}
1896
1897static struct nv50_reg *
1898tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1899{
1900	switch (dst->Register.File) {
1901	case TGSI_FILE_TEMPORARY:
1902		return &pc->temp[dst->Register.Index * 4 + c];
1903	case TGSI_FILE_OUTPUT:
1904		return &pc->result[dst->Register.Index * 4 + c];
1905	case TGSI_FILE_ADDRESS:
1906	{
1907		struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c];
1908		if (!r) {
1909			r = alloc_addr(pc, NULL);
1910			pc->addr[dst->Register.Index * 4 + c] = r;
1911		}
1912		assert(r);
1913		return r;
1914	}
1915	case TGSI_FILE_NULL:
1916		return NULL;
1917	default:
1918		break;
1919	}
1920
1921	return NULL;
1922}
1923
1924static struct nv50_reg *
1925tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1926	 boolean neg)
1927{
1928	struct nv50_reg *r = NULL;
1929	struct nv50_reg *temp;
1930	unsigned sgn, c, swz;
1931
1932	if (src->Register.File != TGSI_FILE_CONSTANT)
1933		assert(!src->Register.Indirect);
1934
1935	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1936
1937	c = tgsi_util_get_full_src_register_swizzle(src, chan);
1938	switch (c) {
1939	case TGSI_SWIZZLE_X:
1940	case TGSI_SWIZZLE_Y:
1941	case TGSI_SWIZZLE_Z:
1942	case TGSI_SWIZZLE_W:
1943		switch (src->Register.File) {
1944		case TGSI_FILE_INPUT:
1945			r = &pc->attr[src->Register.Index * 4 + c];
1946			break;
1947		case TGSI_FILE_TEMPORARY:
1948			r = &pc->temp[src->Register.Index * 4 + c];
1949			break;
1950		case TGSI_FILE_CONSTANT:
1951			if (!src->Register.Indirect) {
1952				r = &pc->param[src->Register.Index * 4 + c];
1953				break;
1954			}
1955			/* Indicate indirection by setting r->acc < 0 and
1956			 * use the index field to select the address reg.
1957			 */
1958			r = reg_instance(pc, NULL);
1959			swz = tgsi_util_get_src_register_swizzle(
1960						 &src->Indirect, 0);
1961			ctor_reg(r, P_CONST,
1962				 src->Indirect.Index * 4 + swz,
1963				 src->Register.Index * 4 + c);
1964			r->acc = -1;
1965			break;
1966		case TGSI_FILE_IMMEDIATE:
1967			r = &pc->immd[src->Register.Index * 4 + c];
1968			break;
1969		case TGSI_FILE_SAMPLER:
1970			break;
1971		case TGSI_FILE_ADDRESS:
1972			r = pc->addr[src->Register.Index * 4 + c];
1973			assert(r);
1974			break;
1975		default:
1976			assert(0);
1977			break;
1978		}
1979		break;
1980	default:
1981		assert(0);
1982		break;
1983	}
1984
1985	switch (sgn) {
1986	case TGSI_UTIL_SIGN_KEEP:
1987		break;
1988	case TGSI_UTIL_SIGN_CLEAR:
1989		temp = temp_temp(pc);
1990		emit_abs(pc, temp, r);
1991		r = temp;
1992		break;
1993	case TGSI_UTIL_SIGN_TOGGLE:
1994		if (neg)
1995			r->mod = NV50_MOD_NEG;
1996		else {
1997			temp = temp_temp(pc);
1998			emit_neg(pc, temp, r);
1999			r = temp;
2000		}
2001		break;
2002	case TGSI_UTIL_SIGN_SET:
2003		temp = temp_temp(pc);
2004		emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | CVT_NEG);
2005		r = temp;
2006		break;
2007	default:
2008		assert(0);
2009		break;
2010	}
2011
2012	if (r && r->acc >= 0 && r != temp)
2013		return reg_instance(pc, r);
2014	return r;
2015}
2016
2017/* return TRUE for ops that produce only a single result */
2018static boolean
2019is_scalar_op(unsigned op)
2020{
2021	switch (op) {
2022	case TGSI_OPCODE_COS:
2023	case TGSI_OPCODE_DP2:
2024	case TGSI_OPCODE_DP3:
2025	case TGSI_OPCODE_DP4:
2026	case TGSI_OPCODE_DPH:
2027	case TGSI_OPCODE_EX2:
2028	case TGSI_OPCODE_LG2:
2029	case TGSI_OPCODE_POW:
2030	case TGSI_OPCODE_RCP:
2031	case TGSI_OPCODE_RSQ:
2032	case TGSI_OPCODE_SIN:
2033		/*
2034	case TGSI_OPCODE_KIL:
2035	case TGSI_OPCODE_LIT:
2036	case TGSI_OPCODE_SCS:
2037		*/
2038		return TRUE;
2039	default:
2040		return FALSE;
2041	}
2042}
2043
2044/* Returns a bitmask indicating which dst components depend
2045 * on source s, component c (reverse of nv50_tgsi_src_mask).
2046 */
2047static unsigned
2048nv50_tgsi_dst_revdep(unsigned op, int s, int c)
2049{
2050	if (is_scalar_op(op))
2051		return 0x1;
2052
2053	switch (op) {
2054	case TGSI_OPCODE_DST:
2055		return (1 << c) & (s ? 0xa : 0x6);
2056	case TGSI_OPCODE_XPD:
2057		switch (c) {
2058		case 0: return 0x6;
2059		case 1: return 0x5;
2060		case 2: return 0x3;
2061		case 3: return 0x0;
2062		default:
2063			assert(0);
2064			return 0x0;
2065		}
2066	case TGSI_OPCODE_LIT:
2067	case TGSI_OPCODE_SCS:
2068	case TGSI_OPCODE_TEX:
2069	case TGSI_OPCODE_TXB:
2070	case TGSI_OPCODE_TXL:
2071	case TGSI_OPCODE_TXP:
2072		/* these take care of dangerous swizzles themselves */
2073		return 0x0;
2074	case TGSI_OPCODE_IF:
2075	case TGSI_OPCODE_KIL:
2076		/* don't call this function for these ops */
2077		assert(0);
2078		return 0;
2079	default:
2080		/* linear vector instruction */
2081		return (1 << c);
2082	}
2083}
2084
/* Check whether exec e is predicated with condition code cc; only long,
 * non-immediate instructions encode one (inst[1] bits 7..10).
 */
static INLINE boolean
has_pred(struct nv50_program_exec *e, unsigned cc)
{
	if (!is_long(e) || is_immd(e))
		return FALSE;
	return ((e->inst[1] & 0x780) == (cc << 7));
}
2092
2093/* on ENDIF see if we can do "@p0.neu single_op" instead of:
2094 *        join_at ENDIF
2095 *        @p0.eq bra ENDIF
2096 *        single_op
2097 * ENDIF: nop.join
2098 */
2099static boolean
2100nv50_kill_branch(struct nv50_pc *pc)
2101{
2102	int lvl = pc->if_lvl;
2103
2104	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
2105		return FALSE;
2106
2107	/* if ccode == 'true', the BRA is from an ELSE and the predicate
2108	 * reg may no longer be valid, since we currently always use $p0
2109	 */
2110	if (has_pred(pc->if_insn[lvl], 0xf))
2111		return FALSE;
2112	assert(pc->if_insn[lvl] && pc->if_join[lvl]);
2113
2114	/* We'll use the exec allocated for JOIN_AT (we can't easily
2115	 * access nv50_program_exec's prev).
2116	 */
2117	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
2118
2119	*pc->if_join[lvl] = *pc->p->exec_tail;
2120
2121	FREE(pc->if_insn[lvl]);
2122	FREE(pc->p->exec_tail);
2123
2124	pc->p->exec_tail = pc->if_join[lvl];
2125	pc->p->exec_tail->next = NULL;
2126	set_pred(pc, 0xd, 0, pc->p->exec_tail);
2127
2128	return TRUE;
2129}
2130
2131static void
2132nv50_fp_move_results(struct nv50_pc *pc)
2133{
2134	struct nv50_reg reg;
2135	unsigned i;
2136
2137	ctor_reg(&reg, P_TEMP, -1, -1);
2138
2139	for (i = 0; i < pc->result_nr * 4; ++i) {
2140		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
2141			continue;
2142		if (pc->result[i].rhw != pc->result[i].hw) {
2143			reg.hw = pc->result[i].rhw;
2144			emit_mov(pc, &reg, &pc->result[i]);
2145		}
2146	}
2147}
2148
2149static boolean
2150nv50_program_tx_insn(struct nv50_pc *pc,
2151		     const struct tgsi_full_instruction *inst)
2152{
2153	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
2154	unsigned mask, sat, unit;
2155	int i, c;
2156
2157	mask = inst->Dst[0].Register.WriteMask;
2158	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
2159
2160	memset(src, 0, sizeof(src));
2161
2162	for (c = 0; c < 4; c++) {
2163		if ((mask & (1 << c)) && !pc->r_dst[c])
2164			dst[c] = tgsi_dst(pc, c, &inst->Dst[0]);
2165		else
2166			dst[c] = pc->r_dst[c];
2167		rdst[c] = dst[c];
2168	}
2169
2170	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2171		const struct tgsi_full_src_register *fs = &inst->Src[i];
2172		unsigned src_mask;
2173		boolean neg_supp;
2174
2175		src_mask = nv50_tgsi_src_mask(inst, i);
2176		neg_supp = negate_supported(inst, i);
2177
2178		if (fs->Register.File == TGSI_FILE_SAMPLER)
2179			unit = fs->Register.Index;
2180
2181		for (c = 0; c < 4; c++)
2182			if (src_mask & (1 << c))
2183				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
2184	}
2185
2186	brdc = temp = pc->r_brdc;
2187	if (brdc && brdc->type != P_TEMP) {
2188		temp = temp_temp(pc);
2189		if (sat)
2190			brdc = temp;
2191	} else
2192	if (sat) {
2193		for (c = 0; c < 4; c++) {
2194			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
2195				continue;
2196			/* rdst[c] = dst[c]; */ /* done above */
2197			dst[c] = temp_temp(pc);
2198		}
2199	}
2200
2201	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
2202
2203	switch (inst->Instruction.Opcode) {
2204	case TGSI_OPCODE_ABS:
2205		for (c = 0; c < 4; c++) {
2206			if (!(mask & (1 << c)))
2207				continue;
2208			emit_abs(pc, dst[c], src[0][c]);
2209		}
2210		break;
2211	case TGSI_OPCODE_ADD:
2212		for (c = 0; c < 4; c++) {
2213			if (!(mask & (1 << c)))
2214				continue;
2215			emit_add(pc, dst[c], src[0][c], src[1][c]);
2216		}
2217		break;
2218	case TGSI_OPCODE_AND:
2219	case TGSI_OPCODE_XOR:
2220	case TGSI_OPCODE_OR:
2221		for (c = 0; c < 4; c++) {
2222			if (!(mask & (1 << c)))
2223				continue;
2224			emit_bitop2(pc, dst[c], src[0][c], src[1][c],
2225				    inst->Instruction.Opcode);
2226		}
2227		break;
2228	case TGSI_OPCODE_ARL:
2229		assert(src[0][0]);
2230		temp = temp_temp(pc);
2231		emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32);
2232		emit_arl(pc, dst[0], temp, 4);
2233		break;
2234	case TGSI_OPCODE_BGNLOOP:
2235		pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc);
2236		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
2237		terminate_mbb(pc);
2238		break;
2239	case TGSI_OPCODE_BRK:
2240		assert(pc->loop_lvl > 0);
2241		emit_break(pc, -1, 0);
2242		break;
2243	case TGSI_OPCODE_CEIL:
2244		for (c = 0; c < 4; c++) {
2245			if (!(mask & (1 << c)))
2246				continue;
2247			emit_cvt(pc, dst[c], src[0][c], -1,
2248				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
2249		}
2250		break;
2251	case TGSI_OPCODE_CMP:
2252		pc->allow32 = FALSE;
2253		for (c = 0; c < 4; c++) {
2254			if (!(mask & (1 << c)))
2255				continue;
2256			emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
2257			emit_mov(pc, dst[c], src[1][c]);
2258			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
2259			emit_mov(pc, dst[c], src[2][c]);
2260			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
2261		}
2262		break;
2263	case TGSI_OPCODE_CONT:
2264		assert(pc->loop_lvl > 0);
2265		emit_branch(pc, -1, 0)->param.index =
2266			pc->loop_pos[pc->loop_lvl - 1];
2267		break;
2268	case TGSI_OPCODE_COS:
2269		if (mask & 8) {
2270			emit_precossin(pc, temp, src[0][3]);
2271			emit_flop(pc, NV50_FLOP_COS, dst[3], temp);
2272			if (!(mask &= 7))
2273				break;
2274			if (temp == dst[3])
2275				temp = brdc = temp_temp(pc);
2276		}
2277		emit_precossin(pc, temp, src[0][0]);
2278		emit_flop(pc, NV50_FLOP_COS, brdc, temp);
2279		break;
2280	case TGSI_OPCODE_DDX:
2281		for (c = 0; c < 4; c++) {
2282			if (!(mask & (1 << c)))
2283				continue;
2284			emit_ddx(pc, dst[c], src[0][c]);
2285		}
2286		break;
2287	case TGSI_OPCODE_DDY:
2288		for (c = 0; c < 4; c++) {
2289			if (!(mask & (1 << c)))
2290				continue;
2291			emit_ddy(pc, dst[c], src[0][c]);
2292		}
2293		break;
2294	case TGSI_OPCODE_DP3:
2295		emit_mul(pc, temp, src[0][0], src[1][0]);
2296		emit_mad(pc, temp, src[0][1], src[1][1], temp);
2297		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
2298		break;
2299	case TGSI_OPCODE_DP4:
2300		emit_mul(pc, temp, src[0][0], src[1][0]);
2301		emit_mad(pc, temp, src[0][1], src[1][1], temp);
2302		emit_mad(pc, temp, src[0][2], src[1][2], temp);
2303		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
2304		break;
2305	case TGSI_OPCODE_DPH:
2306		emit_mul(pc, temp, src[0][0], src[1][0]);
2307		emit_mad(pc, temp, src[0][1], src[1][1], temp);
2308		emit_mad(pc, temp, src[0][2], src[1][2], temp);
2309		emit_add(pc, brdc, src[1][3], temp);
2310		break;
2311	case TGSI_OPCODE_DST:
2312		if (mask & (1 << 1))
2313			emit_mul(pc, dst[1], src[0][1], src[1][1]);
2314		if (mask & (1 << 2))
2315			emit_mov(pc, dst[2], src[0][2]);
2316		if (mask & (1 << 3))
2317			emit_mov(pc, dst[3], src[1][3]);
2318		if (mask & (1 << 0))
2319			emit_mov_immdval(pc, dst[0], 1.0f);
2320		break;
2321	case TGSI_OPCODE_ELSE:
2322		emit_branch(pc, -1, 0);
2323		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
2324		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
2325		terminate_mbb(pc);
2326		break;
2327	case TGSI_OPCODE_ENDIF:
2328		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
2329
2330		/* try to replace branch over 1 insn with a predicated insn */
2331		if (nv50_kill_branch(pc) == TRUE)
2332			break;
2333
2334		if (pc->if_join[pc->if_lvl]) {
2335			pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size;
2336			pc->if_join[pc->if_lvl] = NULL;
2337		}
2338		terminate_mbb(pc);
2339		/* emit a NOP as join point, we could set it on the next
2340		 * one, but would have to make sure it is long and !immd
2341		 */
2342		JOIN_ON(emit_nop(pc));
2343		break;
2344	case TGSI_OPCODE_ENDLOOP:
2345		emit_branch(pc, -1, 0)->param.index =
2346			pc->loop_pos[--pc->loop_lvl];
2347		pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size;
2348		terminate_mbb(pc);
2349		break;
2350	case TGSI_OPCODE_EX2:
2351		emit_preex2(pc, temp, src[0][0]);
2352		emit_flop(pc, NV50_FLOP_EX2, brdc, temp);
2353		break;
2354	case TGSI_OPCODE_FLR:
2355		for (c = 0; c < 4; c++) {
2356			if (!(mask & (1 << c)))
2357				continue;
2358			emit_flr(pc, dst[c], src[0][c]);
2359		}
2360		break;
2361	case TGSI_OPCODE_FRC:
2362		temp = temp_temp(pc);
2363		for (c = 0; c < 4; c++) {
2364			if (!(mask & (1 << c)))
2365				continue;
2366			emit_flr(pc, temp, src[0][c]);
2367			emit_sub(pc, dst[c], src[0][c], temp);
2368		}
2369		break;
2370	case TGSI_OPCODE_IF:
2371		assert(pc->if_lvl < NV50_MAX_COND_NESTING);
2372		emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN,
2373			 CVT_F32_F32);
2374		pc->if_join[pc->if_lvl] = emit_joinat(pc);
2375		pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);;
2376		terminate_mbb(pc);
2377		break;
2378	case TGSI_OPCODE_KIL:
2379		assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]);
2380		emit_kil(pc, src[0][0]);
2381		emit_kil(pc, src[0][1]);
2382		emit_kil(pc, src[0][2]);
2383		emit_kil(pc, src[0][3]);
2384		break;
2385	case TGSI_OPCODE_KILP:
2386		emit_kil(pc, NULL);
2387		break;
2388	case TGSI_OPCODE_LIT:
2389		emit_lit(pc, &dst[0], mask, &src[0][0]);
2390		break;
2391	case TGSI_OPCODE_LG2:
2392		emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]);
2393		break;
2394	case TGSI_OPCODE_LRP:
2395		temp = temp_temp(pc);
2396		for (c = 0; c < 4; c++) {
2397			if (!(mask & (1 << c)))
2398				continue;
2399			emit_sub(pc, temp, src[1][c], src[2][c]);
2400			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
2401		}
2402		break;
2403	case TGSI_OPCODE_MAD:
2404		for (c = 0; c < 4; c++) {
2405			if (!(mask & (1 << c)))
2406				continue;
2407			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
2408		}
2409		break;
2410	case TGSI_OPCODE_MAX:
2411		for (c = 0; c < 4; c++) {
2412			if (!(mask & (1 << c)))
2413				continue;
2414			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
2415		}
2416		break;
2417	case TGSI_OPCODE_MIN:
2418		for (c = 0; c < 4; c++) {
2419			if (!(mask & (1 << c)))
2420				continue;
2421			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
2422		}
2423		break;
2424	case TGSI_OPCODE_MOV:
2425		for (c = 0; c < 4; c++) {
2426			if (!(mask & (1 << c)))
2427				continue;
2428			emit_mov(pc, dst[c], src[0][c]);
2429		}
2430		break;
2431	case TGSI_OPCODE_MUL:
2432		for (c = 0; c < 4; c++) {
2433			if (!(mask & (1 << c)))
2434				continue;
2435			emit_mul(pc, dst[c], src[0][c], src[1][c]);
2436		}
2437		break;
2438	case TGSI_OPCODE_POW:
2439		emit_pow(pc, brdc, src[0][0], src[1][0]);
2440		break;
2441	case TGSI_OPCODE_RCP:
2442		emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]);
2443		break;
2444	case TGSI_OPCODE_RET:
2445		if (pc->p->type == PIPE_SHADER_FRAGMENT)
2446			nv50_fp_move_results(pc);
2447		emit_ret(pc, -1, 0);
2448		break;
2449	case TGSI_OPCODE_RSQ:
2450		src[0][0]->mod |= NV50_MOD_ABS;
2451		emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]);
2452		break;
2453	case TGSI_OPCODE_SCS:
2454		temp = temp_temp(pc);
2455		if (mask & 3)
2456			emit_precossin(pc, temp, src[0][0]);
2457		if (mask & (1 << 0))
2458			emit_flop(pc, NV50_FLOP_COS, dst[0], temp);
2459		if (mask & (1 << 1))
2460			emit_flop(pc, NV50_FLOP_SIN, dst[1], temp);
2461		if (mask & (1 << 2))
2462			emit_mov_immdval(pc, dst[2], 0.0);
2463		if (mask & (1 << 3))
2464			emit_mov_immdval(pc, dst[3], 1.0);
2465		break;
2466	case TGSI_OPCODE_SIN:
2467		if (mask & 8) {
2468			emit_precossin(pc, temp, src[0][3]);
2469			emit_flop(pc, NV50_FLOP_SIN, dst[3], temp);
2470			if (!(mask &= 7))
2471				break;
2472			if (temp == dst[3])
2473				temp = brdc = temp_temp(pc);
2474		}
2475		emit_precossin(pc, temp, src[0][0]);
2476		emit_flop(pc, NV50_FLOP_SIN, brdc, temp);
2477		break;
2478	case TGSI_OPCODE_SLT:
2479	case TGSI_OPCODE_SGE:
2480	case TGSI_OPCODE_SEQ:
2481	case TGSI_OPCODE_SGT:
2482	case TGSI_OPCODE_SLE:
2483	case TGSI_OPCODE_SNE:
2484		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
2485		for (c = 0; c < 4; c++) {
2486			if (!(mask & (1 << c)))
2487				continue;
2488			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
2489		}
2490		break;
2491	case TGSI_OPCODE_SUB:
2492		for (c = 0; c < 4; c++) {
2493			if (!(mask & (1 << c)))
2494				continue;
2495			emit_sub(pc, dst[c], src[0][c], src[1][c]);
2496		}
2497		break;
2498	case TGSI_OPCODE_TEX:
2499		emit_tex(pc, dst, mask, src[0], unit,
2500			 inst->Texture.Texture, FALSE, 0);
2501		break;
2502	case TGSI_OPCODE_TXB:
2503		emit_tex(pc, dst, mask, src[0], unit,
2504			 inst->Texture.Texture, FALSE, -1);
2505		break;
2506	case TGSI_OPCODE_TXL:
2507		emit_tex(pc, dst, mask, src[0], unit,
2508			 inst->Texture.Texture, FALSE, 1);
2509		break;
2510	case TGSI_OPCODE_TXP:
2511		emit_tex(pc, dst, mask, src[0], unit,
2512			 inst->Texture.Texture, TRUE, 0);
2513		break;
2514	case TGSI_OPCODE_TRUNC:
2515		for (c = 0; c < 4; c++) {
2516			if (!(mask & (1 << c)))
2517				continue;
2518			emit_cvt(pc, dst[c], src[0][c], -1,
2519				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
2520		}
2521		break;
2522	case TGSI_OPCODE_XPD:
2523		temp = temp_temp(pc);
2524		if (mask & (1 << 0)) {
2525			emit_mul(pc, temp, src[0][2], src[1][1]);
2526			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
2527		}
2528		if (mask & (1 << 1)) {
2529			emit_mul(pc, temp, src[0][0], src[1][2]);
2530			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
2531		}
2532		if (mask & (1 << 2)) {
2533			emit_mul(pc, temp, src[0][1], src[1][0]);
2534			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
2535		}
2536		if (mask & (1 << 3))
2537			emit_mov_immdval(pc, dst[3], 1.0);
2538		break;
2539	case TGSI_OPCODE_END:
2540		break;
2541	default:
2542		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
2543		return FALSE;
2544	}
2545
2546	if (brdc) {
2547		if (sat)
2548			emit_sat(pc, brdc, brdc);
2549		for (c = 0; c < 4; c++)
2550			if ((mask & (1 << c)) && dst[c] != brdc)
2551				emit_mov(pc, dst[c], brdc);
2552	} else
2553	if (sat) {
2554		for (c = 0; c < 4; c++) {
2555			if (!(mask & (1 << c)))
2556				continue;
2557			/* In this case we saturate later, and dst[c] won't
2558			 * be another temp_temp (and thus lost), since rdst
2559			 * already is TEMP (see above). */
2560			if (rdst[c]->type == P_TEMP && rdst[c]->index < 0)
2561				continue;
2562			emit_sat(pc, rdst[c], dst[c]);
2563		}
2564	}
2565
2566	kill_temp_temp(pc);
2567	pc->reg_instance_nr = 0;
2568
2569	return TRUE;
2570}
2571
2572static void
2573prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
2574{
2575	struct nv50_reg *reg = NULL;
2576	const struct tgsi_full_src_register *src;
2577	const struct tgsi_dst_register *dst;
2578	unsigned i, c, k, mask;
2579
2580	dst = &insn->Dst[0].Register;
2581	mask = dst->WriteMask;
2582
2583        if (dst->File == TGSI_FILE_TEMPORARY)
2584		reg = pc->temp;
2585        else
2586	if (dst->File == TGSI_FILE_OUTPUT) {
2587		reg = pc->result;
2588
2589		if (insn->Instruction.Opcode == TGSI_OPCODE_MOV &&
2590		    dst->Index == pc->edgeflag_out &&
2591		    insn->Src[0].Register.File == TGSI_FILE_INPUT)
2592			pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index;
2593	}
2594
2595	if (reg) {
2596		for (c = 0; c < 4; c++) {
2597			if (!(mask & (1 << c)))
2598				continue;
2599			reg[dst->Index * 4 + c].acc = pc->insn_nr;
2600		}
2601	}
2602
2603	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2604		src = &insn->Src[i];
2605
2606		if (src->Register.File == TGSI_FILE_TEMPORARY)
2607			reg = pc->temp;
2608		else
2609		if (src->Register.File == TGSI_FILE_INPUT)
2610			reg = pc->attr;
2611		else
2612			continue;
2613
2614		mask = nv50_tgsi_src_mask(insn, i);
2615
2616		for (c = 0; c < 4; c++) {
2617			if (!(mask & (1 << c)))
2618				continue;
2619			k = tgsi_util_get_full_src_register_swizzle(src, c);
2620
2621			reg[src->Register.Index * 4 + k].acc = pc->insn_nr;
2622		}
2623	}
2624}
2625
2626/* Returns a bitmask indicating which dst components need to be
2627 * written to temporaries first to avoid 'corrupting' sources.
2628 *
 * m[i]   (out) indicates the component to write in the i-th position
2630 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
2631 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* FIX: 'unsafe' was read with |= below without ever being
	 * initialized — undefined behavior yielding a garbage bitmask.
	 */
	unsigned i, c, x, unsafe = 0;

	/* start from the identity write order */
	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart (loop increment brings us back to position 1) */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
2670
2671/* Select a suitable dst register for broadcasting scalar results,
2672 * or return NULL if we have to allocate an extra TEMP.
2673 *
2674 * If e.g. only 1 component is written, we may also emit the final
2675 * result to a write-only register.
2676 */
2677static struct nv50_reg *
2678tgsi_broadcast_dst(struct nv50_pc *pc,
2679		   const struct tgsi_full_dst_register *fd, unsigned mask)
2680{
2681	if (fd->Register.File == TGSI_FILE_TEMPORARY) {
2682		int c = ffs(~mask & fd->Register.WriteMask);
2683		if (c)
2684			return tgsi_dst(pc, c - 1, fd);
2685	} else {
2686		int c = ffs(fd->Register.WriteMask) - 1;
2687		if ((1 << c) == fd->Register.WriteMask)
2688			return tgsi_dst(pc, c, fd);
2689	}
2690
2691	return NULL;
2692}
2693
2694/* Scan source swizzles and return a bitmask indicating dst regs that
 * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
2696 */
2697static unsigned
2698nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
2699		       unsigned rdep[4])
2700{
2701	const struct tgsi_full_dst_register *fd = &insn->Dst[0];
2702	const struct tgsi_full_src_register *fs;
2703	unsigned i, deqs = 0;
2704
2705	for (i = 0; i < 4; ++i)
2706		rdep[i] = 0;
2707
2708	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2709		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
2710		boolean neg_supp = negate_supported(insn, i);
2711
2712		fs = &insn->Src[i];
2713		if (fs->Register.File != fd->Register.File ||
2714		    fs->Register.Index != fd->Register.Index)
2715			continue;
2716
2717		for (chn = 0; chn < 4; ++chn) {
2718			unsigned s, c;
2719
2720			if (!(mask & (1 << chn))) /* src is not read */
2721				continue;
2722			c = tgsi_util_get_full_src_register_swizzle(fs, chn);
2723			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
2724
2725			if (!(fd->Register.WriteMask & (1 << c)))
2726				continue;
2727
2728			/* no danger if src is copied to TEMP first */
2729			if ((s != TGSI_UTIL_SIGN_KEEP) &&
2730			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
2731				continue;
2732
2733			rdep[c] |= nv50_tgsi_dst_revdep(
2734				insn->Instruction.Opcode, i, chn);
2735			deqs |= (1 << c);
2736		}
2737	}
2738
2739	return deqs;
2740}
2741
/* Translate one TGSI instruction, guarding against dst components that
 * are also read as sources: such components are first written to
 * temporaries (per the masks from nv50_tgsi_scan_swizzle), emitted in a
 * safe order (nv50_revdep_reorder), then copied to the real dst.
 * Returns FALSE on translation failure.
 */
static boolean
nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
{
	struct tgsi_full_instruction insn = tok->FullInstruction;
	const struct tgsi_full_dst_register *fd;
	unsigned i, deqs, rdep[4], m[4];

	fd = &tok->FullInstruction.Dst[0];
	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);

	if (is_scalar_op(insn.Instruction.Opcode)) {
		/* scalar result is broadcast; pick a dst reg that is not
		 * also a source, or fall back to a throwaway temp
		 */
		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
		if (!pc->r_brdc)
			pc->r_brdc = temp_temp(pc);
		return nv50_program_tx_insn(pc, &insn);
	}
	pc->r_brdc = NULL;

	/* no dst/src overlap: emit directly */
	if (!deqs)
		return nv50_program_tx_insn(pc, &insn);

	deqs = nv50_revdep_reorder(m, rdep);

	/* emit one component at a time, in the reordered sequence;
	 * still-unsafe components get redirected into fresh temps
	 */
	for (i = 0; i < 4; ++i) {
		assert(pc->r_dst[m[i]] == NULL);

		insn.Dst[0].Register.WriteMask =
			fd->Register.WriteMask & (1 << m[i]);

		if (!insn.Dst[0].Register.WriteMask)
			continue;

		if (deqs & (1 << i))
			pc->r_dst[m[i]] = alloc_temp(pc, NULL);

		if (!nv50_program_tx_insn(pc, &insn))
			return FALSE;
	}

	/* move the redirected results into the real destination,
	 * applying saturation here if requested
	 */
	for (i = 0; i < 4; i++) {
		struct nv50_reg *reg = pc->r_dst[i];
		if (!reg)
			continue;
		pc->r_dst[i] = NULL;

		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
		else
			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
		free_temp(pc, reg);
	}

	return TRUE;
}
2796
2797static void
2798load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
2799{
2800	struct nv50_reg *iv, **ppiv;
2801	unsigned mode = pc->interp_mode[reg->index];
2802
2803	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
2804	iv = *ppiv;
2805
2806	if ((mode & INTERP_PERSPECTIVE) && !iv) {
2807		iv = *ppiv = alloc_temp(pc, NULL);
2808		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
2809
2810		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2811		emit_flop(pc, NV50_FLOP_RCP, iv, iv);
2812
2813		/* XXX: when loading interpolants dynamically, move these
2814		 * to the program head, or make sure it can't be skipped.
2815		 */
2816	}
2817
2818	emit_interp(pc, reg, iv, mode);
2819}
2820
2821/* The face input is always at v[255] (varying space), with a
2822 * value of 0 for back-facing, and 0xffffffff for front-facing.
2823 */
static void
load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0f);

	assert(a->rhw == -1);
	alloc_reg(pc, a); /* do this before rhw is set */
	a->rhw = 255;
	load_interpolant(pc, a);
	/* AND with 1.0f maps 0xffffffff (front) to 1.0 and 0 (back) to 0.0 */
	emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND);

	FREE(one);
}
2837
/* First translation pass: walk the TGSI token stream to collect
 * immediates, declarations and per-register usage, then assign hardware
 * register ids to inputs/outputs.  For fragment programs this also emits
 * the interpolation preamble and fills the FP_INTERPOLANT_CTRL fields.
 * Returns FALSE on a bad declaration or failed allocation.
 */
static boolean
nv50_program_tx_prep(struct nv50_pc *pc)
{
	struct tgsi_parse_context tp;
	struct nv50_program *p = pc->p;
	boolean ret = FALSE;
	unsigned i, c, flat_nr = 0;

	tgsi_parse_init(&tp, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&tp)) {
		const union tgsi_full_token *tok = &tp.FullToken;

		tgsi_parse_token(&tp);
		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm =
				&tp.FullToken.FullImmediate;

			/* stored now, uploaded to the const buffer later */
			ctor_immd_4f32(pc, imm->u[0].Float,
				       imm->u[1].Float,
				       imm->u[2].Float,
				       imm->u[3].Float);
		}
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *d;
			unsigned si, last, first, mode;

			d = &tp.FullToken.FullDeclaration;
			first = d->Range.First;
			last = d->Range.Last;

			switch (d->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				break;
			case TGSI_FILE_OUTPUT:
				/* only special VP output semantics matter */
				if (!d->Declaration.Semantic ||
				    p->type == PIPE_SHADER_FRAGMENT)
					break;

				si = d->Semantic.Index;
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_BCOLOR:
					p->cfg.two_side[si].hw = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
				case TGSI_SEMANTIC_PSIZE:
					p->cfg.psiz = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					pc->edgeflag_out = first;
					break;
					/*
				case TGSI_SEMANTIC_CLIP_DISTANCE:
					p->cfg.clpd = MIN2(p->cfg.clpd, first);
					break;
					*/
				default:
					break;
				}
				break;
			case TGSI_FILE_INPUT:
			{
				if (p->type != PIPE_SHADER_FRAGMENT)
					break;

				/* record interpolation mode per FP input */
				switch (d->Declaration.Interpolate) {
				case TGSI_INTERPOLATE_CONSTANT:
					mode = INTERP_FLAT;
					flat_nr++;
					break;
				case TGSI_INTERPOLATE_PERSPECTIVE:
					mode = INTERP_PERSPECTIVE;
					p->cfg.regs[1] |= 0x08 << 24;
					break;
				default:
					mode = INTERP_LINEAR;
					break;
				}
				if (d->Declaration.Centroid)
					mode |= INTERP_CENTROID;

				assert(last < 32);
				for (i = first; i <= last; i++)
					pc->interp_mode[i] = mode;
			}
				break;
			case TGSI_FILE_ADDRESS:
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_SAMPLER:
				break;
			default:
				NOUVEAU_ERR("bad decl file %d\n",
					    d->Declaration.File);
				goto out_err;
			}
		}
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			pc->insn_nr++;
			prep_inspect_insn(pc, &tok->FullInstruction);
			break;
		default:
			break;
		}
	}

	if (p->type == PIPE_SHADER_VERTEX) {
		int rid = 0;

		/* pack used attr components into consecutive hw slots */
		for (i = 0; i < pc->attr_nr * 4; ++i) {
			if (pc->attr[i].acc) {
				pc->attr[i].hw = rid++;
				p->cfg.attr[i / 32] |= 1 << (i % 32);
			}
		}

		/* assign result map slots for used output components */
		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
			p->cfg.io[i].hw = rid;
			p->cfg.io[i].id = i;

			for (c = 0; c < 4; ++c) {
				int n = i * 4 + c;
				if (!pc->result[n].acc)
					continue;
				pc->result[n].hw = rid++;
				p->cfg.io[i].mask |= 1 << c;
			}
		}

		/* resolve BCOLOR/PSIZE placeholders (0x40 = unused) */
		for (c = 0; c < 2; ++c)
			if (p->cfg.two_side[c].hw < 0x40)
				p->cfg.two_side[c] = p->cfg.io[
					p->cfg.two_side[c].hw];

		if (p->cfg.psiz < 0x40)
			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
	} else
	if (p->type == PIPE_SHADER_FRAGMENT) {
		int rid, aid;
		unsigned n = 0, m = pc->attr_nr - flat_nr;

		pc->allow32 = TRUE;

		/* base == 0 iff input 0 is the fragment position */
		int base = (TGSI_SEMANTIC_POSITION ==
			    p->info.input_semantic_name[0]) ? 0 : 1;

		/* non-flat interpolants have to be mapped to
		 * the lower hardware IDs, so sort them:
		 */
		for (i = 0; i < pc->attr_nr; i++) {
			if (pc->interp_mode[i] == INTERP_FLAT)
				p->cfg.io[m++].id = i;
			else {
				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
					p->cfg.io[n].linear = TRUE;
				p->cfg.io[n++].id = i;
			}
		}

		if (!base) /* set w-coordinate mask from perspective interp */
			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;

		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);

		/* assign hw interpolant ids and emit loads */
		for (n = 0; n < pc->attr_nr; ++n) {
			p->cfg.io[n].hw = rid = aid;
			i = p->cfg.io[n].id;

			if (p->info.input_semantic_name[n] ==
			    TGSI_SEMANTIC_FACE) {
				load_frontfacing(pc, &pc->attr[i * 4]);
				continue;
			}

			for (c = 0; c < 4; ++c) {
				if (!pc->attr[i * 4 + c].acc)
					continue;
				pc->attr[i * 4 + c].rhw = rid++;
				p->cfg.io[n].mask |= 1 << c;

				load_interpolant(pc, &pc->attr[i * 4 + c]);
			}
			aid += popcnt4(p->cfg.io[n].mask);
		}

		if (!base)
			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;

		m = popcnt4(p->cfg.regs[1] >> 24);

		/* set count of non-position inputs and of non-flat
		 * non-position inputs for FP_INTERPOLANT_CTRL
		 */
		p->cfg.regs[1] |= aid - m;

		if (flat_nr) {
			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
			p->cfg.regs[1] |= (i - m) << 16;
		} else
			p->cfg.regs[1] |= p->cfg.regs[1] << 16;

		/* mark color semantic for light-twoside */
		n = 0x40;
		for (i = 0; i < pc->attr_nr; i++) {
			ubyte si, sn;

			sn = p->info.input_semantic_name[p->cfg.io[i].id];
			si = p->info.input_semantic_index[p->cfg.io[i].id];

			if (sn == TGSI_SEMANTIC_COLOR) {
				p->cfg.two_side[si] = p->cfg.io[i];

				/* increase colour count */
				p->cfg.regs[0] += popcnt4(
					p->cfg.two_side[si].mask) << 16;

				n = MIN2(n, p->cfg.io[i].hw - m);
			}
		}
		if (n < 0x40)
			p->cfg.regs[0] += n;

		/* Initialize FP results:
		 * FragDepth is always first TGSI and last hw output
		 */
		i = p->info.writes_z ? 4 : 0;
		for (rid = 0; i < pc->result_nr * 4; i++)
			pc->result[i].rhw = rid++;
		if (p->info.writes_z)
			pc->result[2].rhw = rid;

		p->cfg.high_result = rid;

		/* separate/different colour results for MRTs ? */
		if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1)
			p->cfg.regs[2] |= 1;
	}

	if (pc->immd_nr) {
		int rid = 0;

		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->immd)
			goto out_err;

		for (i = 0; i < pc->immd_nr; i++) {
			for (c = 0; c < 4; c++, rid++)
				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
		}
	}

	ret = TRUE;
out_err:
	/* release cached 1/w interpolation temps */
	if (pc->iv_p)
		free_temp(pc, pc->iv_p);
	if (pc->iv_c)
		free_temp(pc, pc->iv_c);

	tgsi_parse_free(&tp);
	return ret;
}
3106
3107static void
3108free_nv50_pc(struct nv50_pc *pc)
3109{
3110	if (pc->immd)
3111		FREE(pc->immd);
3112	if (pc->param)
3113		FREE(pc->param);
3114	if (pc->result)
3115		FREE(pc->result);
3116	if (pc->attr)
3117		FREE(pc->attr);
3118	if (pc->temp)
3119		FREE(pc->temp);
3120
3121	FREE(pc);
3122}
3123
/* Initialize translation state from the TGSI scan info: per-file register
 * counts, default config values, and the nv50_reg arrays for temps,
 * inputs, outputs, constants and address regs.
 * Returns FALSE if any allocation fails (free_nv50_pc cleans up).
 */
static boolean
ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
{
	int i, c;
	/* VP reads real attrs and writes real results; FP versions below */
	unsigned rtype[2] = { P_ATTR, P_RESULT };

	pc->p = p;
	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
	assert(pc->addr_nr <= 2);

	p->cfg.high_temp = 4;

	/* 0x40 marks "not present" for hw output ids */
	p->cfg.two_side[0].hw = 0x40;
	p->cfg.two_side[1].hw = 0x40;

	p->cfg.edgeflag_in = pc->edgeflag_out = 0xff;

	switch (p->type) {
	case PIPE_SHADER_VERTEX:
		p->cfg.psiz = 0x40;
		p->cfg.clpd = 0x40;
		p->cfg.io_nr = pc->result_nr;
		break;
	case PIPE_SHADER_FRAGMENT:
		/* FP inputs are interpolated into temps, results live in
		 * temps until moved to their output registers at the end
		 */
		rtype[0] = rtype[1] = P_TEMP;

		p->cfg.regs[0] = 0x01000004;
		p->cfg.io_nr = pc->attr_nr;

		if (p->info.writes_z) {
			p->cfg.regs[2] |= 0x00000100;
			p->cfg.regs[3] |= 0x00000011;
		}
		if (p->info.uses_kill)
			p->cfg.regs[2] |= 0x00100000;
		break;
	}

	if (pc->temp_nr) {
		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->temp)
			return FALSE;

		for (i = 0; i < pc->temp_nr * 4; ++i)
			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
	}

	if (pc->attr_nr) {
		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->attr)
			return FALSE;

		for (i = 0; i < pc->attr_nr * 4; ++i)
			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
	}

	if (pc->result_nr) {
		unsigned nr = pc->result_nr * 4;

		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
		if (!pc->result)
			return FALSE;

		for (i = 0; i < nr; ++i)
			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
	}

	if (pc->param_nr) {
		int rid = 0;

		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->param)
			return FALSE;

		for (i = 0; i < pc->param_nr; ++i)
			for (c = 0; c < 4; ++c, ++rid)
				ctor_reg(&pc->param[rid], P_CONST, i, rid);
	}

	if (pc->addr_nr) {
		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *));
		if (!pc->addr)
			return FALSE;
	}
	/* hardware address regs $a1..$a4 ($a0 unused, see file header note) */
	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
		ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1);

	return TRUE;
}
3217
/* Post-pass over emitted code: pad the stream so no 32-bit instruction
 * ends up alone in a 64-bit slot, adjust branch targets accordingly,
 * and set the exit bit on the (necessarily long) last instruction.
 */
static void
nv50_program_fixup_insns(struct nv50_pc *pc)
{
	struct nv50_program_exec *e, **bra_list;
	unsigned i, n, pos;

	/* NOTE(review): CALLOC result is not checked; on allocation
	 * failure the loop below dereferences NULL — confirm whether a
	 * check/assert should be added.
	 */
	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));

	/* Collect branch instructions, we need to adjust their offsets
	 * when converting 32 bit instructions to 64 bit ones
	 */
	for (n = 0, e = pc->p->exec_head; e; e = e->next)
		if (e->param.index >= 0 && !e->param.mask)
			bra_list[n++] = e;

	/* last instruction must be long so it can have the exit bit set */
	if (!is_long(pc->p->exec_tail))
		convert_to_long(pc, pc->p->exec_tail);
	/* set exit bit */
	pc->p->exec_tail->inst[1] |= 1;

	/* !immd on exit insn simultaneously means !join */
	assert(!is_immd(pc->p->exec_head));
	assert(!is_immd(pc->p->exec_tail));

	/* Make sure we don't have any single 32 bit instructions. */
	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
		pos += is_long(e) ? 2 : 1;

		/* odd position followed by a long insn: widen this one and
		 * shift branch targets that point past it
		 */
		if ((pos & 1) && (!e->next || is_long(e->next))) {
			for (i = 0; i < n; ++i)
				if (bra_list[i]->param.index >= pos)
					bra_list[i]->param.index += 1;
			convert_to_long(pc, e);
			++pos;
		}
	}

	FREE(bra_list);
}
3258
3259static boolean
3260nv50_program_tx(struct nv50_program *p)
3261{
3262	struct tgsi_parse_context parse;
3263	struct nv50_pc *pc;
3264	boolean ret;
3265
3266	pc = CALLOC_STRUCT(nv50_pc);
3267	if (!pc)
3268		return FALSE;
3269
3270	ret = ctor_nv50_pc(pc, p);
3271	if (ret == FALSE)
3272		goto out_cleanup;
3273
3274	ret = nv50_program_tx_prep(pc);
3275	if (ret == FALSE)
3276		goto out_cleanup;
3277
3278	tgsi_parse_init(&parse, pc->p->pipe.tokens);
3279	while (!tgsi_parse_end_of_tokens(&parse)) {
3280		const union tgsi_full_token *tok = &parse.FullToken;
3281
3282		/* don't allow half insn/immd on first and last instruction */
3283		pc->allow32 = TRUE;
3284		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
3285			pc->allow32 = FALSE;
3286
3287		tgsi_parse_token(&parse);
3288
3289		switch (tok->Token.Type) {
3290		case TGSI_TOKEN_TYPE_INSTRUCTION:
3291			++pc->insn_cur;
3292			ret = nv50_tgsi_insn(pc, tok);
3293			if (ret == FALSE)
3294				goto out_err;
3295			break;
3296		default:
3297			break;
3298		}
3299	}
3300
3301	if (pc->p->type == PIPE_SHADER_FRAGMENT)
3302		nv50_fp_move_results(pc);
3303
3304	nv50_program_fixup_insns(pc);
3305
3306	p->param_nr = pc->param_nr * 4;
3307	p->immd_nr = pc->immd_nr * 4;
3308	p->immd = pc->immd_buf;
3309
3310out_err:
3311	tgsi_parse_free(&parse);
3312
3313out_cleanup:
3314	free_nv50_pc(pc);
3315	return ret;
3316}
3317
3318static void
3319nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
3320{
3321	if (nv50_program_tx(p) == FALSE)
3322		assert(0);
3323	p->translated = TRUE;
3324}
3325
3326static void
3327nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map,
3328			unsigned start, unsigned count, unsigned cbuf)
3329{
3330	struct nouveau_channel *chan = nv50->screen->base.channel;
3331	struct nouveau_grobj *tesla = nv50->screen->tesla;
3332
3333	while (count) {
3334		unsigned nr = count > 2047 ? 2047 : count;
3335
3336		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
3337		OUT_RING  (chan, (cbuf << 0) | (start << 8));
3338		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
3339		OUT_RINGp (chan, map, nr);
3340
3341		map += nr;
3342		start += nr;
3343		count -= nr;
3344	}
3345}
3346
3347static void
3348nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
3349{
3350	struct pipe_screen *pscreen = nv50->pipe.screen;
3351
3352	if (!p->data[0] && p->immd_nr) {
3353		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
3354
3355		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
3356			while (heap->next && heap->size < p->immd_nr) {
3357				struct nv50_program *evict = heap->next->priv;
3358				nouveau_resource_free(&evict->data[0]);
3359			}
3360
3361			if (nouveau_resource_alloc(heap, p->immd_nr, p,
3362						   &p->data[0]))
3363				assert(0);
3364		}
3365
3366		/* immediates only need to be uploaded again when freed */
3367		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
3368					 p->immd_nr, NV50_CB_PMISC);
3369	}
3370
3371	assert(p->param_nr <= 512);
3372
3373	if (p->param_nr) {
3374		unsigned cb;
3375		uint32_t *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
3376						PIPE_BUFFER_USAGE_CPU_READ);
3377
3378		if (p->type == PIPE_SHADER_VERTEX)
3379			cb = NV50_CB_PVP;
3380		else
3381			cb = NV50_CB_PFP;
3382
3383		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
3384		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
3385	}
3386}
3387
3388static void
3389nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
3390{
3391	struct nouveau_channel *chan = nv50->screen->base.channel;
3392	struct nv50_program_exec *e;
3393	uint32_t *up, i;
3394	boolean upload = FALSE;
3395
3396	if (!p->bo) {
3397		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
3398			       p->exec_size * 4, &p->bo);
3399		upload = TRUE;
3400	}
3401
3402	if (p->data[0] && p->data[0]->start != p->data_start[0])
3403		upload = TRUE;
3404
3405	if (!upload)
3406		return;
3407
3408	up = MALLOC(p->exec_size * 4);
3409
3410	for (i = 0, e = p->exec_head; e; e = e->next) {
3411		unsigned ei, ci, bs;
3412
3413		if (e->param.index >= 0 && e->param.mask) {
3414			bs = (e->inst[1] >> 22) & 0x07;
3415			assert(bs < 2);
3416			ei = e->param.shift >> 5;
3417			ci = e->param.index;
3418			if (bs == 0)
3419				ci += p->data[bs]->start;
3420
3421			e->inst[ei] &= ~e->param.mask;
3422			e->inst[ei] |= (ci << e->param.shift);
3423		} else
3424		if (e->param.index >= 0) {
3425			/* zero mask means param is a jump/branch offset */
3426			assert(!(e->param.index & 1));
3427			/* seem to be 8 byte steps */
3428			ei = (e->param.index >> 1) + 0 /* START_ID */;
3429
3430			e->inst[0] &= 0xf0000fff;
3431			e->inst[0] |= ei << 12;
3432		}
3433
3434		up[i++] = e->inst[0];
3435		if (is_long(e))
3436			up[i++] = e->inst[1];
3437	}
3438	assert(i == p->exec_size);
3439
3440	if (p->data[0])
3441		p->data_start[0] = p->data[0]->start;
3442
3443#ifdef NV50_PROGRAM_DUMP
3444	NOUVEAU_ERR("-------\n");
3445	for (e = p->exec_head; e; e = e->next) {
3446		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
3447		if (is_long(e))
3448			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
3449	}
3450#endif
3451	nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM,
3452			 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144,
3453			 up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0,
3454			 0, 0, p->exec_size * 4, 1, 1);
3455
3456	FREE(up);
3457}
3458
/* Validate the bound vertex program: translate if necessary, upload its
 * data and code, and build the state object that points the hardware at
 * it (address, attribute enables, result/temp allocation, start id).
 */
void
nv50_vertprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->vertprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(13, 2);
	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
	so_data  (so, p->cfg.attr[0]);
	so_data  (so, p->cfg.attr[1]);
	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
	so_data  (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
	so_data  (so, p->cfg.high_result); //8);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.vertprog);
	so_ref(NULL, &so);
}
3494
/* Validate the bound fragment program: translate if necessary, upload
 * its data and code, and build the state object with the FP's address,
 * temp/result allocation and control registers.
 */
void
nv50_fragprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->fragprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(64, 2);
	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
	so_data  (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_FP_CONTROL, 1);
	so_data  (so, p->cfg.regs[2]);
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
	so_data  (so, p->cfg.regs[3]);
	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.fragprog);
	so_ref(NULL, &so);
}
3530
/* Build the point-coord replacement map (pntc[8], one nibble per hw
 * interpolant): FP generic inputs not written by the VP, or with sprite
 * coord replacement enabled, read the point coordinate instead.
 * 'base' is the first hw interpolant id covered by the map.
 */
static void
nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
{
	struct nv50_program *fp = nv50->fragprog;
	struct nv50_program *vp = nv50->vertprog;
	unsigned i, c, m = base;

	/* XXX: this might not work correctly in all cases yet - we'll
	 * just assume that an FP generic input that is not written in
	 * the VP is PointCoord.
	 */
	memset(pntc, 0, 8 * sizeof(uint32_t));

	for (i = 0; i < fp->cfg.io_nr; i++) {
		uint8_t sn, si;
		uint8_t j, k = fp->cfg.io[i].id;
		unsigned n = popcnt4(fp->cfg.io[i].mask);

		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
			m += n; /* skip its interpolant slots */
			continue;
		}

		/* find the VP output feeding this FP input */
		for (j = 0; j < vp->info.num_outputs; ++j) {
			sn = vp->info.output_semantic_name[j];
			si = vp->info.output_semantic_index[j];

			if (sn == fp->info.input_semantic_name[k] &&
			    si == fp->info.input_semantic_index[k])
				break;
		}

		if (j < vp->info.num_outputs) {
			/* NOTE(review): 'si' here is the matched VP
			 * semantic index from the loop above — confirm
			 * it is the intended sprite_coord_mode index
			 */
			ubyte mode =
				nv50->rasterizer->pipe.sprite_coord_mode[si];

			if (mode == PIPE_SPRITE_COORD_NONE) {
				m += n;
				continue;
			}
		}

		/* this is either PointCoord or replaced by sprite coords */
		for (c = 0; c < 4; c++) {
			if (!(fp->cfg.io[i].mask & (1 << c)))
				continue;
			/* nibble value = texcoord component + 1 */
			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
			++m;
		}
	}
}
3582
3583static int
3584nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
3585	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
3586{
3587	int c;
3588	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
3589	uint8_t *map = (uint8_t *)p_map;
3590
3591	for (c = 0; c < 4; ++c) {
3592		if (mf & 1) {
3593			if (fpi->linear == TRUE)
3594				lin[mid / 32] |= 1 << (mid % 32);
3595			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
3596		}
3597
3598		oid += mv & 1;
3599		mf >>= 1;
3600		mv >>= 1;
3601	}
3602
3603	return mid;
3604}
3605
/* Link the current vertex and fragment programs: build the VP result
 * map (which VP output component lands in which varying slot), the
 * semantic map registers, the linear-interpolation bitmap and, for
 * point sprites, the point-coord replace map, and emit them as a
 * state object into nv50->state.programs.
 *
 * The slot counter 'm' threads through every step below; the order of
 * the mapping steps is significant.
 */
void
nv50_linkage_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *vp = nv50->vertprog;
	struct nv50_program *fp = nv50->fragprog;
	struct nouveau_stateobj *so;
	struct nv50_sreg4 dummy, *vpo;
	int i, n, c, m = 0;
	uint32_t map[16], lin[4], reg[5], pcrd[8];

	memset(map, 0, sizeof(map));
	memset(lin, 0, sizeof(lin));

	reg[1] = 0x00000004; /* low and high clip distance map ids */
	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
	reg[3] = 0x00000000; /* point size map id & enable */
	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
	reg[4] = fp->cfg.regs[1]; /* interpolant info */

	/* Position (vp->cfg.io[0]) is always mapped first, all four
	 * components; the dummy source forces a full 0xf FP-side mask.
	 */
	dummy.linear = FALSE;
	dummy.mask = 0xf; /* map all components of HPOS */
	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);

	/* From here on the dummy stands in for "VP writes nothing". */
	dummy.mask = 0x0;

	/* Clip distances, if the VP writes any (clpd < 0x40 means a
	 * valid hw output id), go right after position.
	 */
	if (vp->cfg.clpd < 0x40) {
		for (c = 0; c < vp->cfg.clpd_nr; ++c)
			map[m++] = vp->cfg.clpd + c;
		reg[1] = (m << 8);
	}

	reg[0] |= m << 8; /* adjust BFC0 id */

	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
	if (nv50->rasterizer->pipe.light_twoside) {
		vpo = &vp->cfg.two_side[0];

		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
	}

	reg[0] += m - 4; /* adjust FFC0 id */
	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */

	/* Map the remaining FP inputs, pairing each with the VP output
	 * of matching semantic name/index (or the dummy if unwritten).
	 */
	for (i = 0; i < fp->cfg.io_nr; i++) {
		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id];
		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id];

		/* position must be mapped first */
		assert(i == 0 || sn != TGSI_SEMANTIC_POSITION);

		/* maybe even remove these from cfg.io */
		if (sn == TGSI_SEMANTIC_POSITION || sn == TGSI_SEMANTIC_FACE)
			continue;

		/* VP outputs and vp->cfg.io are in the same order */
		for (n = 0; n < vp->info.num_outputs; ++n) {
			if (vp->info.output_semantic_name[n] == sn &&
			    vp->info.output_semantic_index[n] == si)
				break;
		}
		vpo = (n < vp->info.num_outputs) ? &vp->cfg.io[n] : &dummy;

		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
	}

	/* Append the point size output last; reg[3] records its slot
	 * id and sets the enable bit.
	 */
	if (nv50->rasterizer->pipe.point_size_per_vertex) {
		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
		reg[3] = (m++ << 4) | 1;
	}

	/* now fill the stateobj */
	so = so_new(64, 0);

	n = (m + 3) / 4; /* number of 32-bit words holding m byte entries */
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
	so_data  (so, m);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
	so_datap (so, map, n);

	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
	so_datap (so, reg, 4);

	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
	so_data  (so, reg[4]);

	so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4);
	so_datap (so, lin, 4);

	if (nv50->rasterizer->pipe.point_sprite) {
		/* reg[4] >> 8 is where the generic inputs start (set above) */
		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);

		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
		so_datap (so, pcrd, 8);
	}

        so_ref(so, &nv50->state.programs);
        so_ref(NULL, &so);
}
3706
3707void
3708nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
3709{
3710	while (p->exec_head) {
3711		struct nv50_program_exec *e = p->exec_head;
3712
3713		p->exec_head = e->next;
3714		FREE(e);
3715	}
3716	p->exec_tail = NULL;
3717	p->exec_size = 0;
3718
3719	nouveau_bo_ref(NULL, &p->bo);
3720
3721	nouveau_resource_free(&p->data[0]);
3722
3723	p->translated = 0;
3724}
3725