nv50_program.c revision 44d8c9add2f095fc365ede751253d9fb7fc5c6e1
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
/* number of entries in nv50_pc::r_temp, i.e. hw temporaries we hand out */
#define NV50_SU_MAX_TEMP 127
/* number of entries in nv50_pc::r_addr, i.e. address registers we track */
#define NV50_SU_MAX_ADDR 4
/* NOTE(review): presumably enables debug dumping of the generated program;
 * the code guarded by it is not visible in this part of the file.
 */
//#define NV50_PROGRAM_DUMP
37
38/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */
39
40/* ARL - gallium craps itself on progs/vp/arl.txt
41 *
42 * MSB - Like MAD, but MUL+SUB
 * 	- Drop it as a separate op; instead introduce a way to negate
 * 	  args for ops that
45 *
46 * Look into inlining IMMD for ops other than MOV (make it general?)
47 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
48 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
49 *
50 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
51 * case, if the emit_src() causes the inst to suddenly become long.
52 *
53 * Verify half-insns work where expected - and force disable them where they
54 * don't work - MUL has it forcibly disabled atm as it fixes POW..
55 *
 * WARNING: watch dst==src vectors, a write can overwrite components that
 * 	are still needed, e.g. SUB R0, R0.yzxw, R0
58 *
59 * Things to check with renouveau:
60 * 	FP attr/result assignment - how?
61 * 		attrib
62 * 			- 0x16bc maps vp output onto fp hpos
63 * 			- 0x16c0 maps vp output onto fp col0
64 * 		result
65 * 			- colr always 0-3
66 * 			- depr always 4
67 * 0x16bc->0x16e8 --> some binding between vp/fp regs
68 * 0x16b8 --> VP output count
69 *
70 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
71 * 	      "MOV rcol.x, fcol.y" = 0x00000004
72 * 0x19a8 --> as above but 0x00000100 and 0x00000000
73 * 	- 0x00100000 used when KIL used
74 * 0x196c --> as above but 0x00000011 and 0x00000000
75 *
76 * 0x1988 --> 0xXXNNNNNN
77 * 	- XX == FP high something
78 */
/* One register as seen by the code generator: which register file it
 * belongs to, its TGSI index and the hardware slot assigned to it.
 */
struct nv50_reg {
	/* register file */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD,
		P_ADDR
	} type;
	/* TGSI index; -1 for temporaries and immediates created internally
	 * (alloc_temp, alloc_immd). For entries of nv50_pc::r_addr the value
	 * has a different meaning, see alloc_addr().
	 */
	int index;

	/* hardware index; -1 for a P_TEMP that has no hw register yet, and
	 * for P_IMMD the offset into nv50_pc::immd_buf
	 */
	int hw;
	int mod; /* mask of NV50_MOD_* bits */

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
96
/* modifier bits kept in nv50_reg::mod */
#define NV50_MOD_NEG 1
#define NV50_MOD_ABS 2
#define NV50_MOD_SAT 4

/* arbitrary limits */
/* sizes of the if_insn/br_join and br_loop/loop_pos arrays in nv50_pc */
#define MAX_IF_DEPTH 4
#define MAX_LOOP_DEPTH 4
104
/* State carried around while generating code for one nv50_program. */
struct nv50_pc {
	/* program being generated; emit() appends instructions to it */
	struct nv50_program *p;

	/* hw resources */
	/* r_temp[i] is the register currently occupying hw temp i, or NULL */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
	/* address register bookkeeping, see alloc_addr() */
	struct nv50_reg r_addr[NV50_SU_MAX_ADDR];

	/* tgsi resources */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	/* immediate values, stored in groups of 4 floats (see ctor_immd) */
	float *immd_buf;
	/* number of 4-float groups in immd_buf */
	int immd_nr;
	/* indexed by TGSI address register index (see alloc_addr, set_data) */
	struct nv50_reg **addr;
	int addr_nr;

	/* scratch registers handed out by temp_temp() and released again
	 * by kill_temp_temp()
	 */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;

	/* broadcast and destination replacement regs */
	struct nv50_reg *r_brdc;
	struct nv50_reg *r_dst[4];

	/* backing store for the copies made by reg_instance() */
	struct nv50_reg reg_instances[16];
	unsigned reg_instance_nr;

	unsigned interp_mode[32];
	/* perspective interpolation registers */
	struct nv50_reg *iv_p;
	struct nv50_reg *iv_c;

	/* last instruction emitted by emit_set(), recorded for OPCODE_IF */
	struct nv50_program_exec *if_cond;
	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
	int if_lvl, loop_lvl;
	unsigned loop_pos[MAX_LOOP_DEPTH];

	/* current instruction and total number of insns */
	unsigned insn_cur;
	unsigned insn_nr;

	/* when FALSE, emit_mov/emit_mul/emit_add always use the long
	 * instruction form
	 */
	boolean allow32;
};
155
156static INLINE struct nv50_reg *
157reg_instance(struct nv50_pc *pc, struct nv50_reg *reg)
158{
159	struct nv50_reg *dup = NULL;
160	if (reg) {
161		assert(pc->reg_instance_nr < 16);
162		dup = &pc->reg_instances[pc->reg_instance_nr++];
163		*dup = *reg;
164		reg->mod = 0;
165	}
166	return dup;
167}
168
169static INLINE void
170ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
171{
172	reg->type = type;
173	reg->index = index;
174	reg->hw = hw;
175	reg->mod = 0;
176	reg->rhw = -1;
177	reg->acc = 0;
178}
179
/* Number of set bits in the low 4 bits of val; higher bits are ignored. */
static INLINE unsigned
popcnt4(uint32_t val)
{
	uint32_t nibble = val & 0xf;
	unsigned bits = 0;

	/* clear the lowest set bit until nothing is left */
	while (nibble) {
		nibble &= nibble - 1;
		++bits;
	}
	return bits;
}
187
188static void
189terminate_mbb(struct nv50_pc *pc)
190{
191	int i;
192
193	/* remove records of temporary address register values */
194	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
195		if (pc->r_addr[i].index < 0)
196			pc->r_addr[i].rhw = -1;
197}
198
199static void
200alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
201{
202	int i = 0;
203
204	if (reg->type == P_RESULT) {
205		if (pc->p->cfg.high_result < (reg->hw + 1))
206			pc->p->cfg.high_result = reg->hw + 1;
207	}
208
209	if (reg->type != P_TEMP)
210		return;
211
212	if (reg->hw >= 0) {
213		/*XXX: do this here too to catch FP temp-as-attr usage..
214		 *     not clean, but works */
215		if (pc->p->cfg.high_temp < (reg->hw + 1))
216			pc->p->cfg.high_temp = reg->hw + 1;
217		return;
218	}
219
220	if (reg->rhw != -1) {
221		/* try to allocate temporary with index rhw first */
222		if (!(pc->r_temp[reg->rhw])) {
223			pc->r_temp[reg->rhw] = reg;
224			reg->hw = reg->rhw;
225			if (pc->p->cfg.high_temp < (reg->rhw + 1))
226				pc->p->cfg.high_temp = reg->rhw + 1;
227			return;
228		}
229		/* make sure we don't get things like $r0 needs to go
230		 * in $r1 and $r1 in $r0
231		 */
232		i = pc->result_nr * 4;
233	}
234
235	for (; i < NV50_SU_MAX_TEMP; i++) {
236		if (!(pc->r_temp[i])) {
237			pc->r_temp[i] = reg;
238			reg->hw = i;
239			if (pc->p->cfg.high_temp < (i + 1))
240				pc->p->cfg.high_temp = i + 1;
241			return;
242		}
243	}
244
245	assert(0);
246}
247
248/* XXX: For shaders that aren't executed linearly (e.g. shaders that
249 * contain loops), we need to assign all hw regs to TGSI TEMPs early,
250 * lest we risk temp_temps overwriting regs alloc'd "later".
251 */
252static struct nv50_reg *
253alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
254{
255	struct nv50_reg *r;
256	int i;
257
258	if (dst && dst->type == P_TEMP && dst->hw == -1)
259		return dst;
260
261	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
262		if (!pc->r_temp[i]) {
263			r = MALLOC_STRUCT(nv50_reg);
264			ctor_reg(r, P_TEMP, -1, i);
265			pc->r_temp[i] = r;
266			return r;
267		}
268	}
269
270	assert(0);
271	return NULL;
272}
273
274/* Assign the hw of the discarded temporary register src
275 * to the tgsi register dst and free src.
276 */
277static void
278assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
279{
280	assert(src->index == -1 && src->hw != -1);
281
282	if (dst->hw != -1)
283		pc->r_temp[dst->hw] = NULL;
284	pc->r_temp[src->hw] = dst;
285	dst->hw = src->hw;
286
287	FREE(src);
288}
289
290/* release the hardware resource held by r */
291static void
292release_hw(struct nv50_pc *pc, struct nv50_reg *r)
293{
294	assert(r->type == P_TEMP);
295	if (r->hw == -1)
296		return;
297
298	assert(pc->r_temp[r->hw] == r);
299	pc->r_temp[r->hw] = NULL;
300
301	r->acc = 0;
302	if (r->index == -1)
303		FREE(r);
304}
305
306static void
307free_temp(struct nv50_pc *pc, struct nv50_reg *r)
308{
309	if (r->index == -1) {
310		unsigned hw = r->hw;
311
312		FREE(pc->r_temp[hw]);
313		pc->r_temp[hw] = NULL;
314	}
315}
316
317static int
318alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
319{
320	int i;
321
322	if ((idx + 4) >= NV50_SU_MAX_TEMP)
323		return 1;
324
325	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
326	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
327		return alloc_temp4(pc, dst, idx + 4);
328
329	for (i = 0; i < 4; i++) {
330		dst[i] = MALLOC_STRUCT(nv50_reg);
331		ctor_reg(dst[i], P_TEMP, -1, idx + i);
332		pc->r_temp[idx + i] = dst[i];
333	}
334
335	return 0;
336}
337
/* Release the four registers in reg[0..3] with free_temp(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	int c = 0;

	while (c < 4) {
		free_temp(pc, reg[c]);
		c++;
	}
}
346
347static struct nv50_reg *
348temp_temp(struct nv50_pc *pc)
349{
350	if (pc->temp_temp_nr >= 16)
351		assert(0);
352
353	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
354	return pc->temp_temp[pc->temp_temp_nr++];
355}
356
357static void
358kill_temp_temp(struct nv50_pc *pc)
359{
360	int i;
361
362	for (i = 0; i < pc->temp_temp_nr; i++)
363		free_temp(pc, pc->temp_temp[i]);
364	pc->temp_temp_nr = 0;
365}
366
367static int
368ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
369{
370	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
371			       (pc->immd_nr + 1) * 4 * sizeof(float));
372	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
373	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
374	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
375	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
376
377	return pc->immd_nr++;
378}
379
380static struct nv50_reg *
381alloc_immd(struct nv50_pc *pc, float f)
382{
383	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
384	unsigned hw;
385
386	for (hw = 0; hw < pc->immd_nr * 4; hw++)
387		if (pc->immd_buf[hw] == f)
388			break;
389
390	if (hw == pc->immd_nr * 4)
391		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
392
393	ctor_reg(r, P_IMMD, -1, hw);
394	return r;
395}
396
397static struct nv50_program_exec *
398exec(struct nv50_pc *pc)
399{
400	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
401
402	e->param.index = -1;
403	return e;
404}
405
406static void
407emit(struct nv50_pc *pc, struct nv50_program_exec *e)
408{
409	struct nv50_program *p = pc->p;
410
411	if (p->exec_tail)
412		p->exec_tail->next = e;
413	if (!p->exec_head)
414		p->exec_head = e;
415	p->exec_tail = e;
416	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
417}
418
419static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
420
421static boolean
422is_long(struct nv50_program_exec *e)
423{
424	if (e->inst[0] & 1)
425		return TRUE;
426	return FALSE;
427}
428
429static boolean
430is_immd(struct nv50_program_exec *e)
431{
432	if (is_long(e) && (e->inst[1] & 3) == 3)
433		return TRUE;
434	return FALSE;
435}
436
437static INLINE void
438set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
439	 struct nv50_program_exec *e)
440{
441	set_long(pc, e);
442	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
443	e->inst[1] |= (pred << 7) | (idx << 12);
444}
445
446static INLINE void
447set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
448	    struct nv50_program_exec *e)
449{
450	set_long(pc, e);
451	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
452	e->inst[1] |= (idx << 4) | (on << 6);
453}
454
455static INLINE void
456set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
457{
458	if (is_long(e))
459		return;
460
461	e->inst[0] |= 1;
462	set_pred(pc, 0xf, 0, e);
463	set_pred_wr(pc, 0, 0, e);
464}
465
466static INLINE void
467set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
468{
469	if (dst->type == P_RESULT) {
470		set_long(pc, e);
471		e->inst[1] |= 0x00000008;
472	}
473
474	alloc_reg(pc, dst);
475	if (dst->hw > 63)
476		set_long(pc, e);
477	e->inst[0] |= (dst->hw << 2);
478}
479
480static INLINE void
481set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
482{
483	unsigned val;
484	float f = pc->immd_buf[imm->hw];
485
486	if (imm->mod & NV50_MOD_ABS)
487		f = fabsf(f);
488	val = fui((imm->mod & NV50_MOD_NEG) ? -f : f);
489
490	set_long(pc, e);
491	/*XXX: can't be predicated - bits overlap.. catch cases where both
492	 *     are required and avoid them. */
493	set_pred(pc, 0, 0, e);
494	set_pred_wr(pc, 0, 0, e);
495
496	e->inst[1] |= 0x00000002 | 0x00000001;
497	e->inst[0] |= (val & 0x3f) << 16;
498	e->inst[1] |= (val >> 6) << 2;
499}
500
501static INLINE void
502set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
503{
504	assert(!(e->inst[0] & 0x0c000000));
505	assert(!(e->inst[1] & 0x00000004));
506
507	e->inst[0] |= (a->hw & 3) << 26;
508	e->inst[1] |= (a->hw >> 2) << 2;
509}
510
511static void
512emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
513		  struct nv50_reg *src0, uint16_t src1_val)
514{
515	struct nv50_program_exec *e = exec(pc);
516
517	e->inst[0] = 0xd0000000 | (src1_val << 9);
518	e->inst[1] = 0x20000000;
519	set_long(pc, e);
520	e->inst[0] |= dst->hw << 2;
521	if (src0) /* otherwise will add to $a0, which is always 0 */
522		set_addr(e, src0);
523
524	emit(pc, e);
525}
526
/* Obtain an address register.
 *
 * ref == NULL: reserve an entry of pc->r_addr for a TGSI address register
 * and return it (asserts and returns NULL if none is available).
 *
 * ref != NULL: return an address register through which 'ref' can be
 * accessed. An already initialised register with a matching base is
 * reused; otherwise one is (re)loaded with emit_add_addr_imm().
 */
static struct nv50_reg *
alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
{
	int i;
	struct nv50_reg *a_tgsi = NULL, *a = NULL;

	if (!ref) {
		/* allocate for TGSI address reg */
		for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
			if (pc->r_addr[i].index >= 0)
				continue;
			/* holds a value used by the current instruction */
			if (pc->r_addr[i].rhw >= 0 &&
			    pc->r_addr[i].acc == pc->insn_cur)
				continue;

			pc->r_addr[i].rhw = -1;
			pc->r_addr[i].index = i;
			return &pc->r_addr[i];
		}
		assert(0);
		return NULL;
	}

	/* Allocate and set an address reg so we can access 'ref'.
	 *
	 * If an r_addr has index < 0, it is not reserved for TGSI,
	 * and index will be the negative of the TGSI addr index the
	 * value in rhw is relative to, or -256 if rhw is an offset
	 * from 0. If rhw < 0, the reg has not been initialized.
	 */
	for (i = NV50_SU_MAX_ADDR - 1; i >= 0; --i) {
		if (pc->r_addr[i].index >= 0) /* occupied for TGSI */
			continue;
		if (pc->r_addr[i].rhw < 0) { /* unused */
			a = &pc->r_addr[i];
			continue;
		}
		/* initialised, but not needed by the current instruction:
		 * remember it as a candidate for being overwritten */
		if (!a && pc->r_addr[i].acc != pc->insn_cur)
			a = &pc->r_addr[i];

		/* ref lies too far above this register's base */
		if (ref->hw - pc->r_addr[i].rhw >= 128)
			continue;

		/* base is of the kind ref needs (offset from 0, or relative
		 * to the same TGSI address register): reuse it */
		if ((ref->acc >= 0 && pc->r_addr[i].index == -256) ||
		    (ref->acc < 0 && -pc->r_addr[i].index == ref->index)) {
			pc->r_addr[i].acc = pc->insn_cur;
			return &pc->r_addr[i];
		}
	}
	assert(a);

	/* ref->acc < 0 marks an access relative to a TGSI address register */
	if (ref->acc < 0)
		a_tgsi = pc->addr[ref->index];

	/* load a with ref's hw index rounded down to a multiple of 128,
	 * times 4, added to a_tgsi if there is one */
	emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4);

	a->rhw = ref->hw & ~0x7f;
	a->acc = pc->insn_cur;
	a->index = a_tgsi ? -ref->index : -256;
	return a;
}
588
/* interpolation mode flags, tested as a bit mask by emit_interp() */
#define INTERP_LINEAR		0
#define INTERP_FLAT		1
#define INTERP_PERSPECTIVE	2
#define INTERP_CENTROID		4
593
594/* interpolant index has been stored in dst->rhw */
595static void
596emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
597		unsigned mode)
598{
599	assert(dst->rhw != -1);
600	struct nv50_program_exec *e = exec(pc);
601
602	e->inst[0] |= 0x80000000;
603	set_dst(pc, dst, e);
604	e->inst[0] |= (dst->rhw << 16);
605
606	if (mode & INTERP_FLAT) {
607		e->inst[0] |= (1 << 8);
608	} else {
609		if (mode & INTERP_PERSPECTIVE) {
610			e->inst[0] |= (1 << 25);
611			alloc_reg(pc, iv);
612			e->inst[0] |= (iv->hw << 9);
613		}
614
615		if (mode & INTERP_CENTROID)
616			e->inst[0] |= (1 << 24);
617	}
618
619	emit(pc, e);
620}
621
622static void
623set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
624	 struct nv50_program_exec *e)
625{
626	set_long(pc, e);
627
628	e->param.index = src->hw & 127;
629	e->param.shift = s;
630	e->param.mask = m << (s % 32);
631
632	if (src->hw > 127)
633		set_addr(e, alloc_addr(pc, src));
634	else
635	if (src->acc < 0) {
636		assert(src->type == P_CONST);
637		set_addr(e, pc->addr[src->index]);
638	}
639
640	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
641}
642
643static void
644emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
645{
646	struct nv50_program_exec *e = exec(pc);
647
648	e->inst[0] = 0x10000000;
649	if (!pc->allow32)
650		set_long(pc, e);
651
652	set_dst(pc, dst, e);
653
654	if (!is_long(e) && src->type == P_IMMD) {
655		set_immd(pc, src, e);
656		/*XXX: 32-bit, but steals part of "half" reg space - need to
657		 *     catch and handle this case if/when we do half-regs
658		 */
659	} else
660	if (src->type == P_IMMD || src->type == P_CONST) {
661		set_long(pc, e);
662		set_data(pc, src, 0x7f, 9, e);
663		e->inst[1] |= 0x20000000; /* mov from c[] */
664	} else {
665		if (src->type == P_ATTR) {
666			set_long(pc, e);
667			e->inst[1] |= 0x00200000;
668		}
669
670		alloc_reg(pc, src);
671		if (src->hw > 63)
672			set_long(pc, e);
673		e->inst[0] |= (src->hw << 9);
674	}
675
676	if (is_long(e) && !is_immd(e)) {
677		e->inst[1] |= 0x04000000; /* 32-bit */
678		e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */
679		if (!(e->inst[1] & 0x20000000))
680			e->inst[1] |= 0x00030000; /* lane mask 2:3 */
681	} else
682		e->inst[0] |= 0x00008000;
683
684	emit(pc, e);
685}
686
687static INLINE void
688emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
689{
690	struct nv50_reg *imm = alloc_immd(pc, f);
691	emit_mov(pc, dst, imm);
692	FREE(imm);
693}
694
695static void
696emit_nop(struct nv50_pc *pc)
697{
698	struct nv50_program_exec *e = exec(pc);
699
700	e->inst[0] = 0xf0000000;
701	set_long(pc, e);
702	e->inst[1] = 0xe0000000;
703	emit(pc, e);
704}
705
706static boolean
707check_swap_src_0_1(struct nv50_pc *pc,
708		   struct nv50_reg **s0, struct nv50_reg **s1)
709{
710	struct nv50_reg *src0 = *s0, *src1 = *s1;
711
712	if (src0->type == P_CONST) {
713		if (src1->type != P_CONST) {
714			*s0 = src1;
715			*s1 = src0;
716			return TRUE;
717		}
718	} else
719	if (src1->type == P_ATTR) {
720		if (src0->type != P_ATTR) {
721			*s0 = src1;
722			*s1 = src0;
723			return TRUE;
724		}
725	}
726
727	return FALSE;
728}
729
730static void
731set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
732		     struct nv50_program_exec *e)
733{
734	struct nv50_reg *temp;
735
736	if (src->type != P_TEMP) {
737		temp = temp_temp(pc);
738		emit_mov(pc, temp, src);
739		src = temp;
740	}
741
742	alloc_reg(pc, src);
743	if (src->hw > 63)
744		set_long(pc, e);
745	e->inst[0] |= (src->hw << 9);
746}
747
748static void
749set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
750{
751	if (src->type == P_ATTR) {
752		set_long(pc, e);
753		e->inst[1] |= 0x00200000;
754	} else
755	if (src->type == P_CONST || src->type == P_IMMD) {
756		struct nv50_reg *temp = temp_temp(pc);
757
758		emit_mov(pc, temp, src);
759		src = temp;
760	}
761
762	alloc_reg(pc, src);
763	if (src->hw > 63)
764		set_long(pc, e);
765	e->inst[0] |= (src->hw << 9);
766}
767
768static void
769set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
770{
771	if (src->type == P_ATTR) {
772		struct nv50_reg *temp = temp_temp(pc);
773
774		emit_mov(pc, temp, src);
775		src = temp;
776	} else
777	if (src->type == P_CONST || src->type == P_IMMD) {
778		assert(!(e->inst[0] & 0x00800000));
779		if (e->inst[0] & 0x01000000) {
780			struct nv50_reg *temp = temp_temp(pc);
781
782			emit_mov(pc, temp, src);
783			src = temp;
784		} else {
785			set_data(pc, src, 0x7f, 16, e);
786			e->inst[0] |= 0x00800000;
787		}
788	}
789
790	alloc_reg(pc, src);
791	if (src->hw > 63)
792		set_long(pc, e);
793	e->inst[0] |= ((src->hw & 127) << 16);
794}
795
796static void
797set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
798{
799	set_long(pc, e);
800
801	if (src->type == P_ATTR) {
802		struct nv50_reg *temp = temp_temp(pc);
803
804		emit_mov(pc, temp, src);
805		src = temp;
806	} else
807	if (src->type == P_CONST || src->type == P_IMMD) {
808		assert(!(e->inst[0] & 0x01000000));
809		if (e->inst[0] & 0x00800000) {
810			struct nv50_reg *temp = temp_temp(pc);
811
812			emit_mov(pc, temp, src);
813			src = temp;
814		} else {
815			set_data(pc, src, 0x7f, 32+14, e);
816			e->inst[0] |= 0x01000000;
817		}
818	}
819
820	alloc_reg(pc, src);
821	e->inst[1] |= ((src->hw & 127) << 14);
822}
823
824static void
825emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred)
826{
827	struct nv50_program_exec *e = exec(pc);
828
829	assert(dst->type == P_TEMP);
830	e->inst[1] = 0x20000000 | (pred << 12);
831	set_long(pc, e);
832	set_dst(pc, dst, e);
833
834	emit(pc, e);
835}
836
837static void
838emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src)
839{
840	struct nv50_program_exec *e = exec(pc);
841
842	e->inst[0] = 0x000001fc;
843	e->inst[1] = 0xa0000008;
844	set_long(pc, e);
845	set_pred_wr(pc, 1, pred, e);
846	set_src_0_restricted(pc, src, e);
847
848	emit(pc, e);
849}
850
851static void
852emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
853	 struct nv50_reg *src1)
854{
855	struct nv50_program_exec *e = exec(pc);
856
857	e->inst[0] |= 0xc0000000;
858
859	if (!pc->allow32)
860		set_long(pc, e);
861
862	check_swap_src_0_1(pc, &src0, &src1);
863	set_dst(pc, dst, e);
864	set_src_0(pc, src0, e);
865	if (src1->type == P_IMMD && !is_long(e)) {
866		if (src0->mod & NV50_MOD_NEG)
867			e->inst[0] |= 0x00008000;
868		set_immd(pc, src1, e);
869	} else {
870		set_src_1(pc, src1, e);
871		if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) {
872			if (is_long(e))
873				e->inst[1] |= 0x08000000;
874			else
875				e->inst[0] |= 0x00008000;
876		}
877	}
878
879	emit(pc, e);
880}
881
882static void
883emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
884	 struct nv50_reg *src0, struct nv50_reg *src1)
885{
886	struct nv50_program_exec *e = exec(pc);
887
888	e->inst[0] = 0xb0000000;
889
890	alloc_reg(pc, src1);
891	check_swap_src_0_1(pc, &src0, &src1);
892
893	if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) {
894		set_long(pc, e);
895		e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) |
896			      ((src1->mod & NV50_MOD_NEG) << 27);
897	}
898
899	set_dst(pc, dst, e);
900	set_src_0(pc, src0, e);
901	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
902		set_src_2(pc, src1, e);
903	else
904	if (src1->type == P_IMMD)
905		set_immd(pc, src1, e);
906	else
907		set_src_1(pc, src1, e);
908
909	emit(pc, e);
910}
911
912static void
913emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
914	 uint8_t s)
915{
916	struct nv50_program_exec *e = exec(pc);
917
918	set_long(pc, e);
919	e->inst[1] |= 0xc0000000;
920
921	e->inst[0] |= dst->hw << 2;
922	e->inst[0] |= s << 16; /* shift left */
923	set_src_0_restricted(pc, src, e);
924
925	emit(pc, e);
926}
927
928static void
929emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
930	    struct nv50_reg *src0, struct nv50_reg *src1)
931{
932	struct nv50_program_exec *e = exec(pc);
933
934	set_long(pc, e);
935	e->inst[0] |= 0xb0000000;
936	e->inst[1] |= (sub << 29);
937
938	check_swap_src_0_1(pc, &src0, &src1);
939	set_dst(pc, dst, e);
940	set_src_0(pc, src0, e);
941	set_src_1(pc, src1, e);
942
943	if (src0->mod & NV50_MOD_ABS)
944		e->inst[1] |= 0x00100000;
945	if (src1->mod & NV50_MOD_ABS)
946		e->inst[1] |= 0x00080000;
947
948	emit(pc, e);
949}
950
951static INLINE void
952emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
953	 struct nv50_reg *src1)
954{
955	src1->mod ^= NV50_MOD_NEG;
956	emit_add(pc, dst, src0, src1);
957	src1->mod ^= NV50_MOD_NEG;
958}
959
960static void
961emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
962	    struct nv50_reg *src1, unsigned op)
963{
964	struct nv50_program_exec *e = exec(pc);
965
966	e->inst[0] = 0xd0000000;
967	set_long(pc, e);
968
969	check_swap_src_0_1(pc, &src0, &src1);
970	set_dst(pc, dst, e);
971	set_src_0(pc, src0, e);
972
973	if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR &&
974	    op != TGSI_OPCODE_XOR)
975		assert(!"invalid bit op");
976
977	if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
978		set_immd(pc, src1, e);
979		if (op == TGSI_OPCODE_OR)
980			e->inst[0] |= 0x0100;
981		else
982		if (op == TGSI_OPCODE_XOR)
983			e->inst[0] |= 0x8000;
984	} else {
985		set_src_1(pc, src1, e);
986		e->inst[1] |= 0x04000000; /* 32 bit */
987		if (op == TGSI_OPCODE_OR)
988			e->inst[1] |= 0x4000;
989		else
990		if (op == TGSI_OPCODE_XOR)
991			e->inst[1] |= 0x8000;
992	}
993
994	emit(pc, e);
995}
996
997static void
998emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
999	 struct nv50_reg *src1, struct nv50_reg *src2)
1000{
1001	struct nv50_program_exec *e = exec(pc);
1002
1003	e->inst[0] |= 0xe0000000;
1004
1005	check_swap_src_0_1(pc, &src0, &src1);
1006	set_dst(pc, dst, e);
1007	set_src_0(pc, src0, e);
1008	set_src_1(pc, src1, e);
1009	set_src_2(pc, src2, e);
1010
1011	if ((src0->mod ^ src1->mod) & NV50_MOD_NEG)
1012		e->inst[1] |= 0x04000000;
1013	if (src2->mod & NV50_MOD_NEG)
1014		e->inst[1] |= 0x08000000;
1015
1016	emit(pc, e);
1017}
1018
1019static INLINE void
1020emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
1021	 struct nv50_reg *src1, struct nv50_reg *src2)
1022{
1023	src2->mod ^= NV50_MOD_NEG;
1024	emit_mad(pc, dst, src0, src1, src2);
1025	src2->mod ^= NV50_MOD_NEG;
1026}
1027
1028static void
1029emit_flop(struct nv50_pc *pc, unsigned sub,
1030	  struct nv50_reg *dst, struct nv50_reg *src)
1031{
1032	struct nv50_program_exec *e = exec(pc);
1033
1034	e->inst[0] |= 0x90000000;
1035	if (sub) {
1036		set_long(pc, e);
1037		e->inst[1] |= (sub << 29);
1038	}
1039
1040	set_dst(pc, dst, e);
1041
1042	if (sub == 0 || sub == 2)
1043		set_src_0_restricted(pc, src, e);
1044	else
1045		set_src_0(pc, src, e);
1046
1047	emit(pc, e);
1048}
1049
1050static void
1051emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1052{
1053	struct nv50_program_exec *e = exec(pc);
1054
1055	e->inst[0] |= 0xb0000000;
1056
1057	set_dst(pc, dst, e);
1058	set_src_0(pc, src, e);
1059	set_long(pc, e);
1060	e->inst[1] |= (6 << 29) | 0x00004000;
1061
1062	emit(pc, e);
1063}
1064
1065static void
1066emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1067{
1068	struct nv50_program_exec *e = exec(pc);
1069
1070	e->inst[0] |= 0xb0000000;
1071
1072	set_dst(pc, dst, e);
1073	set_src_0(pc, src, e);
1074	set_long(pc, e);
1075	e->inst[1] |= (6 << 29);
1076
1077	emit(pc, e);
1078}
1079
/* values for the 'cvn' argument of emit_cvt() (placed at bit 16 of
 * inst[1]); CVTOP_SAT and CVTOP_ABS are flags that can be OR'ed in
 */
#define CVTOP_RN	0x01
#define CVTOP_FLOOR	0x03
#define CVTOP_CEIL	0x05
#define CVTOP_TRUNC	0x07
#define CVTOP_SAT	0x08
#define CVTOP_ABS	0x10

/* values for the 'fmt' argument of emit_cvt() (placed at bit 24 of
 * inst[1]); CVT_NEG and CVT_RI are flags that can be OR'ed in
 */
/* 0x04 == 32 bit dst */
/* 0x40 == dst is float */
/* 0x80 == src is float */
#define CVT_F32_F32 0xc4
#define CVT_F32_S32 0x44
#define CVT_S32_F32 0x8c
#define CVT_S32_S32 0x0c
#define CVT_NEG     0x20
#define CVT_RI      0x08
1096
1097static void
1098emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
1099	 int wp, unsigned cvn, unsigned fmt)
1100{
1101	struct nv50_program_exec *e;
1102
1103	e = exec(pc);
1104	set_long(pc, e);
1105
1106	e->inst[0] |= 0xa0000000;
1107	e->inst[1] |= 0x00004000; /* 32 bit src */
1108	e->inst[1] |= (cvn << 16);
1109	e->inst[1] |= (fmt << 24);
1110	set_src_0(pc, src, e);
1111
1112	if (wp >= 0)
1113		set_pred_wr(pc, 1, wp, e);
1114
1115	if (dst)
1116		set_dst(pc, dst, e);
1117	else {
1118		e->inst[0] |= 0x000001fc;
1119		e->inst[1] |= 0x00000008;
1120	}
1121
1122	emit(pc, e);
1123}
1124
1125/* nv50 Condition codes:
1126 *  0x1 = LT
1127 *  0x2 = EQ
1128 *  0x3 = LE
1129 *  0x4 = GT
1130 *  0x5 = NE
1131 *  0x6 = GE
1132 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
1133 *  0x8 = unordered bit (allows NaN)
1134 */
1135static void
1136emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
1137	 struct nv50_reg *src0, struct nv50_reg *src1)
1138{
1139	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
1140
1141	struct nv50_program_exec *e = exec(pc);
1142	struct nv50_reg *rdst;
1143
1144	assert(ccode < 16);
1145	if (check_swap_src_0_1(pc, &src0, &src1))
1146		ccode = cc_swapped[ccode & 7] | (ccode & 8);
1147
1148	rdst = dst;
1149	if (dst && dst->type != P_TEMP)
1150		dst = alloc_temp(pc, NULL);
1151
1152	/* set.u32 */
1153	set_long(pc, e);
1154	e->inst[0] |= 0xb0000000;
1155	e->inst[1] |= 0x60000000 | (ccode << 14);
1156
1157	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
1158	 * that doesn't seem to match what the hw actually does
1159	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
1160	 */
1161
1162	if (wp >= 0)
1163		set_pred_wr(pc, 1, wp, e);
1164	if (dst)
1165		set_dst(pc, dst, e);
1166	else {
1167		e->inst[0] |= 0x000001fc;
1168		e->inst[1] |= 0x00000008;
1169	}
1170
1171	set_src_0(pc, src0, e);
1172	set_src_1(pc, src1, e);
1173
1174	emit(pc, e);
1175	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */
1176
1177	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
1178	if (rdst)
1179		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
1180	if (rdst && rdst != dst)
1181		free_temp(pc, dst);
1182}
1183
1184static INLINE unsigned
1185map_tgsi_setop_cc(unsigned op)
1186{
1187	switch (op) {
1188	case TGSI_OPCODE_SLT: return 0x1;
1189	case TGSI_OPCODE_SGE: return 0x6;
1190	case TGSI_OPCODE_SEQ: return 0x2;
1191	case TGSI_OPCODE_SGT: return 0x4;
1192	case TGSI_OPCODE_SLE: return 0x3;
1193	case TGSI_OPCODE_SNE: return 0xd;
1194	default:
1195		assert(0);
1196		return 0;
1197	}
1198}
1199
1200static INLINE void
1201emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1202{
1203	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
1204}
1205
/* Emit "dst = pow(v, e)" as flop 3 on v, a multiply by e, preex2 and
 * flop 6, using one temporary for the intermediate values.
 * NOTE(review): flop 3 / flop 6 are presumably lg2 / ex2 - confirm.
 */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *acc = alloc_temp(pc, NULL);

	emit_flop(pc, 3, acc, v);
	emit_mul(pc, acc, acc, e);

	emit_preex2(pc, acc, acc);
	emit_flop(pc, 6, dst, acc);

	free_temp(pc, acc);
}
1219
1220static INLINE void
1221emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1222{
1223	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
1224}
1225
1226static INLINE void
1227emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1228{
1229	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
1230}
1231
/* Emit code for the LIT instruction.
 *
 * dst/src are per-component register arrays, 'mask' selects which dst
 * components are written (bit 0 = x ... bit 3 = w):
 *   x, w: constant 1.0
 *   y:    emit_minmax(sub 4) of src.x and 0.0
 *   z:    emit_pow() of the clamped src.y and src.w, then overwritten
 *         with 0.0 by a mov predicated on the result of the src.x
 *         instruction
 * NOTE(review): minmax sub 4 / 5 are presumably max / min - confirm.
 *
 * Short encodings are disabled while the y/z code is generated, because
 * predicate fields are patched into already emitted instructions.
 */
static void
emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0);
	struct nv50_reg *zero = alloc_immd(pc, 0.0);
	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
	struct nv50_reg *tmp[4];
	boolean allow32 = pc->allow32;

	pc->allow32 = FALSE;

	/* needed for y itself and as the condition for z */
	if (mask & (3 << 1)) {
		tmp[0] = alloc_temp(pc, NULL);
		emit_minmax(pc, 4, tmp[0], src[0], zero);
	}

	if (mask & (1 << 2)) {
		/* make the instruction just emitted for tmp[0] write
		 * predicate register 0 */
		set_pred_wr(pc, 1, 0, pc->p->exec_tail);

		tmp[1] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[1], src[1], zero);

		/* clamp the exponent using neg128 / pos128 */
		tmp[3] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[3], src[3], neg128);
		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);

		emit_pow(pc, dst[2], tmp[1], tmp[3]);
		/* z = 0, executed only under condition 3 (LE) on
		 * predicate register 0 */
		emit_mov(pc, dst[2], zero);
		set_pred(pc, 3, 0, pc->p->exec_tail);
	}

	/* tmp[0] either becomes dst.y or is no longer needed */
	if (mask & (1 << 1))
		assimilate_temp(pc, dst[1], tmp[0]);
	else
	if (mask & (1 << 2))
		free_temp(pc, tmp[0]);

	pc->allow32 = allow32;

	/* do this last, in case src[i,j] == dst[0,3] */
	if (mask & (1 << 0))
		emit_mov(pc, dst[0], one);

	if (mask & (1 << 3))
		emit_mov(pc, dst[3], one);

	FREE(pos128);
	FREE(neg128);
	FREE(zero);
	FREE(one);
}
1285
1286static INLINE void
1287emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1288{
1289	emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
1290}
1291
1292static void
1293emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
1294{
1295	struct nv50_program_exec *e;
1296	const int r_pred = 1;
1297	unsigned cvn = CVT_F32_F32;
1298
1299	if (src->mod & NV50_MOD_NEG)
1300		cvn |= CVT_NEG;
1301	/* write predicate reg */
1302	emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);
1303
1304	/* conditional discard */
1305	e = exec(pc);
1306	e->inst[0] = 0x00000002;
1307	set_long(pc, e);
1308	set_pred(pc, 0x1 /* LT */, r_pred, e);
1309	emit(pc, e);
1310}
1311
1312static struct nv50_program_exec *
1313emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
1314	    struct nv50_program_exec **join)
1315{
1316	struct nv50_program_exec *e = exec(pc);
1317
1318	if (join) {
1319		set_long(pc, e);
1320		e->inst[0] |= 0xa0000002;
1321		emit(pc, e);
1322		*join = e;
1323		e = exec(pc);
1324	}
1325
1326	set_long(pc, e);
1327	e->inst[0] |= 0x10000002;
1328	if (pred >= 0)
1329		set_pred(pc, cc, pred, e);
1330	emit(pc, e);
1331	return pc->p->exec_tail;
1332}
1333
1334#define QOP_ADD 0
1335#define QOP_SUBR 1
1336#define QOP_SUB 2
1337#define QOP_MOV_SRC1 3
1338
1339/* For a quad of threads / top left, top right, bottom left, bottom right
1340 * pixels, do a different operation, and take src0 from a specific thread.
1341 */
1342static void
1343emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0,
1344	    struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop)
1345{
1346       struct nv50_program_exec *e = exec(pc);
1347
1348       e->inst[0] = 0xc0000000;
1349       e->inst[1] = 0x80000000;
1350       set_long(pc, e);
1351       e->inst[0] |= lane_src0 << 16;
1352       set_src_0(pc, src0, e);
1353       set_src_2(pc, src1, e);
1354
1355       if (wp >= 0)
1356	       set_pred_wr(pc, 1, wp, e);
1357
1358       if (dst)
1359	       set_dst(pc, dst, e);
1360       else {
1361	       e->inst[0] |= 0x000001fc;
1362	       e->inst[1] |= 0x00000008;
1363       }
1364
1365       e->inst[0] |= (qop & 3) << 20;
1366       e->inst[1] |= (qop >> 2) << 22;
1367
1368       emit(pc, e);
1369}
1370
1371static void
1372load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
1373		     struct nv50_reg **src, unsigned arg, boolean proj)
1374{
1375	int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod };
1376
1377	src[0]->mod |= NV50_MOD_ABS;
1378	src[1]->mod |= NV50_MOD_ABS;
1379	src[2]->mod |= NV50_MOD_ABS;
1380
1381	emit_minmax(pc, 4, t[2], src[0], src[1]);
1382	emit_minmax(pc, 4, t[2], src[2], t[2]);
1383
1384	src[0]->mod = mod[0];
1385	src[1]->mod = mod[1];
1386	src[2]->mod = mod[2];
1387
1388	if (proj && 0 /* looks more correct without this */)
1389		emit_mul(pc, t[2], t[2], src[3]);
1390	else
1391	if (arg == 4) /* there is no textureProj(samplerCubeShadow) */
1392		emit_mov(pc, t[3], src[3]);
1393
1394	emit_flop(pc, 0, t[2], t[2]);
1395
1396	emit_mul(pc, t[0], src[0], t[2]);
1397	emit_mul(pc, t[1], src[1], t[2]);
1398	emit_mul(pc, t[2], src[2], t[2]);
1399}
1400
1401static void
1402load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
1403		     struct nv50_reg **src, unsigned dim, unsigned arg)
1404{
1405	unsigned c, mode;
1406
1407	if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1408		mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE;
1409
1410		t[3]->rhw = src[3]->rhw;
1411		emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1412		emit_flop(pc, 0, t[3], t[3]);
1413
1414		for (c = 0; c < dim; ++c) {
1415			t[c]->rhw = src[c]->rhw;
1416			emit_interp(pc, t[c], t[3], mode);
1417		}
1418		if (arg != dim) { /* depth reference value */
1419			t[dim]->rhw = src[2]->rhw;
1420			emit_interp(pc, t[dim], t[3], mode);
1421		}
1422	} else {
1423		/* XXX: for some reason the blob sometimes uses MAD
1424		 * (mad f32 $rX $rY $rZ neg $r63)
1425		 */
1426		emit_flop(pc, 0, t[3], src[3]);
1427		for (c = 0; c < dim; ++c)
1428			emit_mul(pc, t[c], src[c], t[3]);
1429		if (arg != dim) /* depth reference value */
1430			emit_mul(pc, t[dim], src[2], t[3]);
1431	}
1432}
1433
1434static INLINE void
1435get_tex_dim(unsigned type, unsigned *dim, unsigned *arg)
1436{
1437	switch (type) {
1438	case TGSI_TEXTURE_1D:
1439		*arg = *dim = 1;
1440		break;
1441	case TGSI_TEXTURE_SHADOW1D:
1442		*dim = 1;
1443		*arg = 2;
1444		break;
1445	case TGSI_TEXTURE_UNKNOWN:
1446	case TGSI_TEXTURE_2D:
1447	case TGSI_TEXTURE_RECT:
1448		*arg = *dim = 2;
1449		break;
1450	case TGSI_TEXTURE_SHADOW2D:
1451	case TGSI_TEXTURE_SHADOWRECT:
1452		*dim = 2;
1453		*arg = 3;
1454		break;
1455	case TGSI_TEXTURE_3D:
1456	case TGSI_TEXTURE_CUBE:
1457		*dim = *arg = 3;
1458		break;
1459	default:
1460		assert(0);
1461		break;
1462	}
1463}
1464
1465/* We shouldn't execute TEXLOD if any of the pixels in a quad have
1466 * different LOD values, so branch off groups of equal LOD.
1467 */
1468static void
1469emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod,
1470		     struct nv50_reg *src, struct nv50_program_exec *tex)
1471{
1472	struct nv50_program_exec *join_at;
1473	unsigned i, target = pc->p->exec_size + 7 * 2;
1474
1475	/* Subtract lod of each pixel from lod of top left pixel, jump
1476	 * texlod insn if result is 0, then repeat for 2 other pixels.
1477	 */
1478	emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55);
1479	emit_branch(pc, 0, 2, &join_at)->param.index = target;
1480
1481	for (i = 1; i < 4; ++i) {
1482		emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55);
1483		emit_branch(pc, 0, 2, NULL)->param.index = target;
1484	}
1485
1486	emit_mov(pc, tlod, src); /* target */
1487	emit(pc, tex); /* texlod */
1488
1489	join_at->param.index = target + 2 * 2;
1490	emit_nop(pc);
1491	pc->p->exec_tail->inst[1] |= 2; /* join _after_ tex */
1492}
1493
1494static void
1495emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg,
1496		      struct nv50_program_exec *tex)
1497{
1498	struct nv50_program_exec *e;
1499	struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL);
1500	int r_pred = 0;
1501	unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 };
1502
1503	pc->allow32 = FALSE;
1504	ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4);
1505
1506	/* Subtract bias value of thread i from bias values of each thread,
1507	 * store result in r_pred, and set bit i in r_bits if result was 0.
1508	 */
1509	assert(arg < 4);
1510	for (i = 0; i < 4; ++i, ++imm_1248.hw) {
1511		emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55);
1512		emit_mov(pc, r_bits, &imm_1248);
1513		set_pred(pc, 2, r_pred, pc->p->exec_tail);
1514	}
1515	emit_mov_to_pred(pc, r_pred, r_bits);
1516
1517	/* The lanes of a quad are now grouped by the bit in r_pred they have
1518	 * set. Put the input values for TEX into a new register set for each
1519	 * group and execute TEX only for a specific group.
1520	 * We cannot use the same register set for each group because we need
1521	 * the derivatives, which are implicitly calculated, to be correct.
1522	 */
1523	for (i = 1; i < 4; ++i) {
1524		alloc_temp4(pc, t123[i], 0);
1525
1526		for (c = 0; c <= arg; ++c)
1527			emit_mov(pc, t123[i][c], t[c]);
1528
1529		*(e = exec(pc)) = *(tex);
1530		e->inst[0] &= ~0x01fc;
1531		set_dst(pc, t123[i][0], e);
1532		set_pred(pc, cc[i], r_pred, e);
1533		emit(pc, e);
1534	}
1535	/* finally TEX on the original regs (where we kept the input) */
1536	set_pred(pc, cc[0], r_pred, tex);
1537	emit(pc, tex);
1538
1539	/* put the 3 * n other results into regs for lane 0 */
1540	n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc));
1541	for (i = 1; i < 4; ++i) {
1542		for (c = 0; c < n; ++c) {
1543			emit_mov(pc, t[c], t123[i][c]);
1544			set_pred(pc, cc[i], r_pred, pc->p->exec_tail);
1545		}
1546		free_temp4(pc, t123[i]);
1547	}
1548
1549	emit_nop(pc);
1550	free_temp(pc, r_bits);
1551}
1552
1553static void
1554emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1555	 struct nv50_reg **src, unsigned unit, unsigned type,
1556	 boolean proj, int bias_lod)
1557{
1558	struct nv50_reg *t[4];
1559	struct nv50_program_exec *e;
1560	unsigned c, dim, arg;
1561
1562	/* t[i] must be within a single 128 bit super-reg */
1563	alloc_temp4(pc, t, 0);
1564
1565	e = exec(pc);
1566	e->inst[0] = 0xf0000000;
1567	set_long(pc, e);
1568	set_dst(pc, t[0], e);
1569
1570	/* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */
1571	e->inst[0] |= (unit << 9) /* | (unit << 17) */;
1572
1573	/* live flag (don't set if TEX results affect input to another TEX): */
1574	/* e->inst[0] |= 0x00000004; */
1575
1576	get_tex_dim(type, &dim, &arg);
1577
1578	if (type == TGSI_TEXTURE_CUBE) {
1579		e->inst[0] |= 0x08000000;
1580		load_cube_tex_coords(pc, t, src, arg, proj);
1581	} else
1582	if (proj)
1583		load_proj_tex_coords(pc, t, src, dim, arg);
1584	else {
1585		for (c = 0; c < dim; c++)
1586			emit_mov(pc, t[c], src[c]);
1587		if (arg != dim) /* depth reference value (always src.z here) */
1588			emit_mov(pc, t[dim], src[2]);
1589	}
1590
1591	e->inst[0] |= (mask & 0x3) << 25;
1592	e->inst[1] |= (mask & 0xc) << 12;
1593
1594	if (!bias_lod) {
1595		e->inst[0] |= (arg - 1) << 22;
1596		emit(pc, e);
1597	} else
1598	if (bias_lod < 0) {
1599		e->inst[0] |= arg << 22;
1600		e->inst[1] |= 0x20000000; /* texbias */
1601		emit_mov(pc, t[arg], src[3]);
1602		emit_texbias_sequence(pc, t, arg, e);
1603	} else {
1604		e->inst[0] |= arg << 22;
1605		e->inst[1] |= 0x40000000; /* texlod */
1606		emit_mov(pc, t[arg], src[3]);
1607		emit_texlod_sequence(pc, t[arg], src[3], e);
1608	}
1609
1610#if 1
1611	c = 0;
1612	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
1613	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
1614	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
1615	if (mask & 8) emit_mov(pc, dst[3], t[c]);
1616
1617	free_temp4(pc, t);
1618#else
1619	/* XXX: if p.e. MUL is used directly after TEX, it would still use
1620	 * the texture coordinates, not the fetched values: latency ? */
1621
1622	for (c = 0; c < 4; c++) {
1623		if (mask & (1 << c))
1624			assimilate_temp(pc, dst[c], t[c]);
1625		else
1626			free_temp(pc, t[c]);
1627	}
1628#endif
1629}
1630
1631static void
1632emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1633{
1634	struct nv50_program_exec *e = exec(pc);
1635
1636	assert(src->type == P_TEMP);
1637
1638	e->inst[0] = 0xc0140000;
1639	e->inst[1] = 0x89800000;
1640	set_long(pc, e);
1641	set_dst(pc, dst, e);
1642	set_src_0(pc, src, e);
1643	set_src_2(pc, src, e);
1644
1645	emit(pc, e);
1646}
1647
1648static void
1649emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1650{
1651	struct nv50_reg *r = src;
1652	struct nv50_program_exec *e = exec(pc);
1653
1654	assert(src->type == P_TEMP);
1655
1656	if (!(src->mod & NV50_MOD_NEG)) { /* ! double negation */
1657		r = alloc_temp(pc, NULL);
1658		emit_neg(pc, r, src);
1659	}
1660
1661	e->inst[0] = 0xc0150000;
1662	e->inst[1] = 0x8a400000;
1663	set_long(pc, e);
1664	set_dst(pc, dst, e);
1665	set_src_0(pc, r, e);
1666	set_src_2(pc, r, e);
1667
1668	if (r != src)
1669		free_temp(pc, r);
1670
1671	emit(pc, e);
1672}
1673
1674static void
1675convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1676{
1677	unsigned q = 0, m = ~0;
1678
1679	assert(!is_long(e));
1680
1681	switch (e->inst[0] >> 28) {
1682	case 0x1:
1683		/* MOV */
1684		q = 0x0403c000;
1685		m = 0xffff7fff;
1686		break;
1687	case 0x8:
1688		/* INTERP (move centroid, perspective and flat bits) */
1689		m = ~0x03000100;
1690		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1691		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1692		break;
1693	case 0x9:
1694		/* RCP */
1695		break;
1696	case 0xB:
1697		/* ADD */
1698		m = ~(127 << 16);
1699		q = ((e->inst[0] & (~m)) >> 2);
1700		break;
1701	case 0xC:
1702		/* MUL */
1703		m = ~0x00008000;
1704		q = ((e->inst[0] & (~m)) << 12);
1705		break;
1706	case 0xE:
1707		/* MAD (if src2 == dst) */
1708		q = ((e->inst[0] & 0x1fc) << 12);
1709		break;
1710	default:
1711		assert(0);
1712		break;
1713	}
1714
1715	set_long(pc, e);
1716	pc->p->exec_size++;
1717
1718	e->inst[0] &= m;
1719	e->inst[1] |= q;
1720}
1721
1722/* Some operations support an optional negation flag. */
1723static boolean
1724negate_supported(const struct tgsi_full_instruction *insn, int i)
1725{
1726	switch (insn->Instruction.Opcode) {
1727	case TGSI_OPCODE_DDY:
1728	case TGSI_OPCODE_DP3:
1729	case TGSI_OPCODE_DP4:
1730	case TGSI_OPCODE_MUL:
1731	case TGSI_OPCODE_KIL:
1732	case TGSI_OPCODE_ADD:
1733	case TGSI_OPCODE_SUB:
1734	case TGSI_OPCODE_MAD:
1735		return TRUE;
1736	case TGSI_OPCODE_POW:
1737		if (i == 1)
1738			return TRUE;
1739		return FALSE;
1740	default:
1741		return FALSE;
1742	}
1743}
1744
1745/* Return a read mask for source registers deduced from opcode & write mask. */
1746static unsigned
1747nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
1748{
1749	unsigned x, mask = insn->Dst[0].Register.WriteMask;
1750
1751	switch (insn->Instruction.Opcode) {
1752	case TGSI_OPCODE_COS:
1753	case TGSI_OPCODE_SIN:
1754		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
1755	case TGSI_OPCODE_DP3:
1756		return 0x7;
1757	case TGSI_OPCODE_DP4:
1758	case TGSI_OPCODE_DPH:
1759	case TGSI_OPCODE_KIL: /* WriteMask ignored */
1760		return 0xf;
1761	case TGSI_OPCODE_DST:
1762		return mask & (c ? 0xa : 0x6);
1763	case TGSI_OPCODE_EX2:
1764	case TGSI_OPCODE_LG2:
1765	case TGSI_OPCODE_POW:
1766	case TGSI_OPCODE_RCP:
1767	case TGSI_OPCODE_RSQ:
1768	case TGSI_OPCODE_SCS:
1769		return 0x1;
1770	case TGSI_OPCODE_IF:
1771		return 0x1;
1772	case TGSI_OPCODE_LIT:
1773		return 0xb;
1774	case TGSI_OPCODE_TEX:
1775	case TGSI_OPCODE_TXB:
1776	case TGSI_OPCODE_TXL:
1777	case TGSI_OPCODE_TXP:
1778	{
1779		const struct tgsi_instruction_texture *tex;
1780
1781		assert(insn->Instruction.Texture);
1782		tex = &insn->Texture;
1783
1784		mask = 0x7;
1785		if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
1786		    insn->Instruction.Opcode != TGSI_OPCODE_TXD)
1787			mask |= 0x8; /* bias, lod or proj */
1788
1789		switch (tex->Texture) {
1790		case TGSI_TEXTURE_1D:
1791			mask &= 0x9;
1792			break;
1793		case TGSI_TEXTURE_SHADOW1D:
1794			mask &= 0x5;
1795			break;
1796		case TGSI_TEXTURE_2D:
1797			mask &= 0xb;
1798			break;
1799		default:
1800			break;
1801		}
1802	}
1803		return mask;
1804	case TGSI_OPCODE_XPD:
1805		x = 0;
1806		if (mask & 1) x |= 0x6;
1807		if (mask & 2) x |= 0x5;
1808		if (mask & 4) x |= 0x3;
1809		return x;
1810	default:
1811		break;
1812	}
1813
1814	return mask;
1815}
1816
1817static struct nv50_reg *
1818tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1819{
1820	switch (dst->Register.File) {
1821	case TGSI_FILE_TEMPORARY:
1822		return &pc->temp[dst->Register.Index * 4 + c];
1823	case TGSI_FILE_OUTPUT:
1824		return &pc->result[dst->Register.Index * 4 + c];
1825	case TGSI_FILE_ADDRESS:
1826	{
1827		struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c];
1828		if (!r) {
1829			r = alloc_addr(pc, NULL);
1830			pc->addr[dst->Register.Index * 4 + c] = r;
1831		}
1832		assert(r);
1833		return r;
1834	}
1835	case TGSI_FILE_NULL:
1836		return NULL;
1837	default:
1838		break;
1839	}
1840
1841	return NULL;
1842}
1843
1844static struct nv50_reg *
1845tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1846	 boolean neg)
1847{
1848	struct nv50_reg *r = NULL;
1849	struct nv50_reg *temp;
1850	unsigned sgn, c, swz;
1851
1852	if (src->Register.File != TGSI_FILE_CONSTANT)
1853		assert(!src->Register.Indirect);
1854
1855	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1856
1857	c = tgsi_util_get_full_src_register_swizzle(src, chan);
1858	switch (c) {
1859	case TGSI_SWIZZLE_X:
1860	case TGSI_SWIZZLE_Y:
1861	case TGSI_SWIZZLE_Z:
1862	case TGSI_SWIZZLE_W:
1863		switch (src->Register.File) {
1864		case TGSI_FILE_INPUT:
1865			r = &pc->attr[src->Register.Index * 4 + c];
1866			break;
1867		case TGSI_FILE_TEMPORARY:
1868			r = &pc->temp[src->Register.Index * 4 + c];
1869			break;
1870		case TGSI_FILE_CONSTANT:
1871			if (!src->Register.Indirect) {
1872				r = &pc->param[src->Register.Index * 4 + c];
1873				break;
1874			}
1875			/* Indicate indirection by setting r->acc < 0 and
1876			 * use the index field to select the address reg.
1877			 */
1878			r = MALLOC_STRUCT(nv50_reg);
1879			swz = tgsi_util_get_src_register_swizzle(
1880						 &src->Indirect, 0);
1881			ctor_reg(r, P_CONST,
1882				 src->Indirect.Index * 4 + swz,
1883				 src->Register.Index * 4 + c);
1884			r->acc = -1;
1885			break;
1886		case TGSI_FILE_IMMEDIATE:
1887			r = &pc->immd[src->Register.Index * 4 + c];
1888			break;
1889		case TGSI_FILE_SAMPLER:
1890			break;
1891		case TGSI_FILE_ADDRESS:
1892			r = pc->addr[src->Register.Index * 4 + c];
1893			assert(r);
1894			break;
1895		default:
1896			assert(0);
1897			break;
1898		}
1899		break;
1900	default:
1901		assert(0);
1902		break;
1903	}
1904
1905	switch (sgn) {
1906	case TGSI_UTIL_SIGN_KEEP:
1907		break;
1908	case TGSI_UTIL_SIGN_CLEAR:
1909		temp = temp_temp(pc);
1910		emit_abs(pc, temp, r);
1911		r = temp;
1912		break;
1913	case TGSI_UTIL_SIGN_TOGGLE:
1914		if (neg)
1915			r->mod = NV50_MOD_NEG;
1916		else {
1917			temp = temp_temp(pc);
1918			emit_neg(pc, temp, r);
1919			r = temp;
1920		}
1921		break;
1922	case TGSI_UTIL_SIGN_SET:
1923		temp = temp_temp(pc);
1924		emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | CVT_NEG);
1925		r = temp;
1926		break;
1927	default:
1928		assert(0);
1929		break;
1930	}
1931
1932	return r;
1933}
1934
1935/* return TRUE for ops that produce only a single result */
1936static boolean
1937is_scalar_op(unsigned op)
1938{
1939	switch (op) {
1940	case TGSI_OPCODE_COS:
1941	case TGSI_OPCODE_DP2:
1942	case TGSI_OPCODE_DP3:
1943	case TGSI_OPCODE_DP4:
1944	case TGSI_OPCODE_DPH:
1945	case TGSI_OPCODE_EX2:
1946	case TGSI_OPCODE_LG2:
1947	case TGSI_OPCODE_POW:
1948	case TGSI_OPCODE_RCP:
1949	case TGSI_OPCODE_RSQ:
1950	case TGSI_OPCODE_SIN:
1951		/*
1952	case TGSI_OPCODE_KIL:
1953	case TGSI_OPCODE_LIT:
1954	case TGSI_OPCODE_SCS:
1955		*/
1956		return TRUE;
1957	default:
1958		return FALSE;
1959	}
1960}
1961
1962/* Returns a bitmask indicating which dst components depend
1963 * on source s, component c (reverse of nv50_tgsi_src_mask).
1964 */
1965static unsigned
1966nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1967{
1968	if (is_scalar_op(op))
1969		return 0x1;
1970
1971	switch (op) {
1972	case TGSI_OPCODE_DST:
1973		return (1 << c) & (s ? 0xa : 0x6);
1974	case TGSI_OPCODE_XPD:
1975		switch (c) {
1976		case 0: return 0x6;
1977		case 1: return 0x5;
1978		case 2: return 0x3;
1979		case 3: return 0x0;
1980		default:
1981			assert(0);
1982			return 0x0;
1983		}
1984	case TGSI_OPCODE_LIT:
1985	case TGSI_OPCODE_SCS:
1986	case TGSI_OPCODE_TEX:
1987	case TGSI_OPCODE_TXB:
1988	case TGSI_OPCODE_TXL:
1989	case TGSI_OPCODE_TXP:
1990		/* these take care of dangerous swizzles themselves */
1991		return 0x0;
1992	case TGSI_OPCODE_IF:
1993	case TGSI_OPCODE_KIL:
1994		/* don't call this function for these ops */
1995		assert(0);
1996		return 0;
1997	default:
1998		/* linear vector instruction */
1999		return (1 << c);
2000	}
2001}
2002
2003static INLINE boolean
2004has_pred(struct nv50_program_exec *e, unsigned cc)
2005{
2006	if (!is_long(e) || is_immd(e))
2007		return FALSE;
2008	return ((e->inst[1] & 0x780) == (cc << 7));
2009}
2010
2011/* on ENDIF see if we can do "@p0.neu single_op" instead of:
2012 *        join_at ENDIF
2013 *        @p0.eq bra ENDIF
2014 *        single_op
2015 * ENDIF: nop.join
2016 */
2017static boolean
2018nv50_kill_branch(struct nv50_pc *pc)
2019{
2020	int lvl = pc->if_lvl;
2021
2022	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
2023		return FALSE;
2024
2025	/* if ccode == 'true', the BRA is from an ELSE and the predicate
2026	 * reg may no longer be valid, since we currently always use $p0
2027	 */
2028	if (has_pred(pc->if_insn[lvl], 0xf))
2029		return FALSE;
2030	assert(pc->if_insn[lvl] && pc->br_join[lvl]);
2031
2032	/* We'll use the exec allocated for JOIN_AT (as we can't easily
2033	 * update prev's next); if exec_tail is BRK, update the pointer.
2034	 */
2035	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
2036		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];
2037
2038	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
2039
2040	*pc->br_join[lvl] = *pc->p->exec_tail;
2041
2042	FREE(pc->if_insn[lvl]);
2043	FREE(pc->p->exec_tail);
2044
2045	pc->p->exec_tail = pc->br_join[lvl];
2046	pc->p->exec_tail->next = NULL;
2047	set_pred(pc, 0xd, 0, pc->p->exec_tail);
2048
2049	return TRUE;
2050}
2051
2052static boolean
2053nv50_program_tx_insn(struct nv50_pc *pc,
2054		     const struct tgsi_full_instruction *inst)
2055{
2056	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
2057	unsigned mask, sat, unit;
2058	int i, c;
2059
2060	mask = inst->Dst[0].Register.WriteMask;
2061	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
2062
2063	memset(src, 0, sizeof(src));
2064
2065	for (c = 0; c < 4; c++) {
2066		if ((mask & (1 << c)) && !pc->r_dst[c])
2067			dst[c] = tgsi_dst(pc, c, &inst->Dst[0]);
2068		else
2069			dst[c] = pc->r_dst[c];
2070		rdst[c] = dst[c];
2071	}
2072
2073	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2074		const struct tgsi_full_src_register *fs = &inst->Src[i];
2075		unsigned src_mask;
2076		boolean neg_supp;
2077
2078		src_mask = nv50_tgsi_src_mask(inst, i);
2079		neg_supp = negate_supported(inst, i);
2080
2081		if (fs->Register.File == TGSI_FILE_SAMPLER)
2082			unit = fs->Register.Index;
2083
2084		for (c = 0; c < 4; c++)
2085			if (src_mask & (1 << c))
2086				src[i][c] = reg_instance(pc,
2087					tgsi_src(pc, c, fs, neg_supp));
2088	}
2089
2090	brdc = temp = pc->r_brdc;
2091	if (brdc && brdc->type != P_TEMP) {
2092		temp = temp_temp(pc);
2093		if (sat)
2094			brdc = temp;
2095	} else
2096	if (sat) {
2097		for (c = 0; c < 4; c++) {
2098			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
2099				continue;
2100			/* rdst[c] = dst[c]; */ /* done above */
2101			dst[c] = temp_temp(pc);
2102		}
2103	}
2104
2105	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
2106
2107	switch (inst->Instruction.Opcode) {
2108	case TGSI_OPCODE_ABS:
2109		for (c = 0; c < 4; c++) {
2110			if (!(mask & (1 << c)))
2111				continue;
2112			emit_abs(pc, dst[c], src[0][c]);
2113		}
2114		break;
2115	case TGSI_OPCODE_ADD:
2116		for (c = 0; c < 4; c++) {
2117			if (!(mask & (1 << c)))
2118				continue;
2119			emit_add(pc, dst[c], src[0][c], src[1][c]);
2120		}
2121		break;
2122	case TGSI_OPCODE_AND:
2123	case TGSI_OPCODE_XOR:
2124	case TGSI_OPCODE_OR:
2125		for (c = 0; c < 4; c++) {
2126			if (!(mask & (1 << c)))
2127				continue;
2128			emit_bitop2(pc, dst[c], src[0][c], src[1][c],
2129				    inst->Instruction.Opcode);
2130		}
2131		break;
2132	case TGSI_OPCODE_ARL:
2133		assert(src[0][0]);
2134		temp = temp_temp(pc);
2135		emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32);
2136		emit_arl(pc, dst[0], temp, 4);
2137		break;
2138	case TGSI_OPCODE_BGNLOOP:
2139		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
2140		terminate_mbb(pc);
2141		break;
2142	case TGSI_OPCODE_BRK:
2143		emit_branch(pc, -1, 0, NULL);
2144		assert(pc->loop_lvl > 0);
2145		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
2146		break;
2147	case TGSI_OPCODE_CEIL:
2148		for (c = 0; c < 4; c++) {
2149			if (!(mask & (1 << c)))
2150				continue;
2151			emit_cvt(pc, dst[c], src[0][c], -1,
2152				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
2153		}
2154		break;
2155	case TGSI_OPCODE_CMP:
2156		pc->allow32 = FALSE;
2157		for (c = 0; c < 4; c++) {
2158			if (!(mask & (1 << c)))
2159				continue;
2160			emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
2161			emit_mov(pc, dst[c], src[1][c]);
2162			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
2163			emit_mov(pc, dst[c], src[2][c]);
2164			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
2165		}
2166		break;
2167	case TGSI_OPCODE_COS:
2168		if (mask & 8) {
2169			emit_precossin(pc, temp, src[0][3]);
2170			emit_flop(pc, 5, dst[3], temp);
2171			if (!(mask &= 7))
2172				break;
2173			if (temp == dst[3])
2174				temp = brdc = temp_temp(pc);
2175		}
2176		emit_precossin(pc, temp, src[0][0]);
2177		emit_flop(pc, 5, brdc, temp);
2178		break;
2179	case TGSI_OPCODE_DDX:
2180		for (c = 0; c < 4; c++) {
2181			if (!(mask & (1 << c)))
2182				continue;
2183			emit_ddx(pc, dst[c], src[0][c]);
2184		}
2185		break;
2186	case TGSI_OPCODE_DDY:
2187		for (c = 0; c < 4; c++) {
2188			if (!(mask & (1 << c)))
2189				continue;
2190			emit_ddy(pc, dst[c], src[0][c]);
2191		}
2192		break;
2193	case TGSI_OPCODE_DP3:
2194		emit_mul(pc, temp, src[0][0], src[1][0]);
2195		emit_mad(pc, temp, src[0][1], src[1][1], temp);
2196		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
2197		break;
2198	case TGSI_OPCODE_DP4:
2199		emit_mul(pc, temp, src[0][0], src[1][0]);
2200		emit_mad(pc, temp, src[0][1], src[1][1], temp);
2201		emit_mad(pc, temp, src[0][2], src[1][2], temp);
2202		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
2203		break;
2204	case TGSI_OPCODE_DPH:
2205		emit_mul(pc, temp, src[0][0], src[1][0]);
2206		emit_mad(pc, temp, src[0][1], src[1][1], temp);
2207		emit_mad(pc, temp, src[0][2], src[1][2], temp);
2208		emit_add(pc, brdc, src[1][3], temp);
2209		break;
2210	case TGSI_OPCODE_DST:
2211		if (mask & (1 << 1))
2212			emit_mul(pc, dst[1], src[0][1], src[1][1]);
2213		if (mask & (1 << 2))
2214			emit_mov(pc, dst[2], src[0][2]);
2215		if (mask & (1 << 3))
2216			emit_mov(pc, dst[3], src[1][3]);
2217		if (mask & (1 << 0))
2218			emit_mov_immdval(pc, dst[0], 1.0f);
2219		break;
2220	case TGSI_OPCODE_ELSE:
2221		emit_branch(pc, -1, 0, NULL);
2222		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
2223		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
2224		terminate_mbb(pc);
2225		break;
2226	case TGSI_OPCODE_ENDIF:
2227		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
2228
2229		/* try to replace branch over 1 insn with a predicated insn */
2230		if (nv50_kill_branch(pc) == TRUE)
2231			break;
2232
2233		if (pc->br_join[pc->if_lvl]) {
2234			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
2235			pc->br_join[pc->if_lvl] = NULL;
2236		}
2237		terminate_mbb(pc);
2238		/* emit a NOP as join point, we could set it on the next
2239		 * one, but would have to make sure it is long and !immd
2240		 */
2241		emit_nop(pc);
2242		pc->p->exec_tail->inst[1] |= 2;
2243		break;
2244	case TGSI_OPCODE_ENDLOOP:
2245		emit_branch(pc, -1, 0, NULL);
2246		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
2247		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
2248		terminate_mbb(pc);
2249		break;
2250	case TGSI_OPCODE_EX2:
2251		emit_preex2(pc, temp, src[0][0]);
2252		emit_flop(pc, 6, brdc, temp);
2253		break;
2254	case TGSI_OPCODE_FLR:
2255		for (c = 0; c < 4; c++) {
2256			if (!(mask & (1 << c)))
2257				continue;
2258			emit_flr(pc, dst[c], src[0][c]);
2259		}
2260		break;
2261	case TGSI_OPCODE_FRC:
2262		temp = temp_temp(pc);
2263		for (c = 0; c < 4; c++) {
2264			if (!(mask & (1 << c)))
2265				continue;
2266			emit_flr(pc, temp, src[0][c]);
2267			emit_sub(pc, dst[c], src[0][c], temp);
2268		}
2269		break;
2270	case TGSI_OPCODE_IF:
2271		/* emitting a join_at may not be necessary */
2272		assert(pc->if_lvl < MAX_IF_DEPTH);
2273		/* set_pred_wr(pc, 1, 0, pc->if_cond); */
2274		emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN,
2275			 CVT_F32_F32);
2276		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
2277		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
2278		terminate_mbb(pc);
2279		break;
2280	case TGSI_OPCODE_KIL:
2281		emit_kil(pc, src[0][0]);
2282		emit_kil(pc, src[0][1]);
2283		emit_kil(pc, src[0][2]);
2284		emit_kil(pc, src[0][3]);
2285		break;
2286	case TGSI_OPCODE_LIT:
2287		emit_lit(pc, &dst[0], mask, &src[0][0]);
2288		break;
2289	case TGSI_OPCODE_LG2:
2290		emit_flop(pc, 3, brdc, src[0][0]);
2291		break;
2292	case TGSI_OPCODE_LRP:
2293		temp = temp_temp(pc);
2294		for (c = 0; c < 4; c++) {
2295			if (!(mask & (1 << c)))
2296				continue;
2297			emit_sub(pc, temp, src[1][c], src[2][c]);
2298			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
2299		}
2300		break;
2301	case TGSI_OPCODE_MAD:
2302		for (c = 0; c < 4; c++) {
2303			if (!(mask & (1 << c)))
2304				continue;
2305			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
2306		}
2307		break;
2308	case TGSI_OPCODE_MAX:
2309		for (c = 0; c < 4; c++) {
2310			if (!(mask & (1 << c)))
2311				continue;
2312			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
2313		}
2314		break;
2315	case TGSI_OPCODE_MIN:
2316		for (c = 0; c < 4; c++) {
2317			if (!(mask & (1 << c)))
2318				continue;
2319			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
2320		}
2321		break;
2322	case TGSI_OPCODE_MOV:
2323		for (c = 0; c < 4; c++) {
2324			if (!(mask & (1 << c)))
2325				continue;
2326			emit_mov(pc, dst[c], src[0][c]);
2327		}
2328		break;
2329	case TGSI_OPCODE_MUL:
2330		for (c = 0; c < 4; c++) {
2331			if (!(mask & (1 << c)))
2332				continue;
2333			emit_mul(pc, dst[c], src[0][c], src[1][c]);
2334		}
2335		break;
2336	case TGSI_OPCODE_POW:
2337		emit_pow(pc, brdc, src[0][0], src[1][0]);
2338		break;
2339	case TGSI_OPCODE_RCP:
2340		emit_flop(pc, 0, brdc, src[0][0]);
2341		break;
2342	case TGSI_OPCODE_RSQ:
2343		emit_flop(pc, 2, brdc, src[0][0]);
2344		break;
2345	case TGSI_OPCODE_SCS:
2346		temp = temp_temp(pc);
2347		if (mask & 3)
2348			emit_precossin(pc, temp, src[0][0]);
2349		if (mask & (1 << 0))
2350			emit_flop(pc, 5, dst[0], temp);
2351		if (mask & (1 << 1))
2352			emit_flop(pc, 4, dst[1], temp);
2353		if (mask & (1 << 2))
2354			emit_mov_immdval(pc, dst[2], 0.0);
2355		if (mask & (1 << 3))
2356			emit_mov_immdval(pc, dst[3], 1.0);
2357		break;
2358	case TGSI_OPCODE_SIN:
2359		if (mask & 8) {
2360			emit_precossin(pc, temp, src[0][3]);
2361			emit_flop(pc, 4, dst[3], temp);
2362			if (!(mask &= 7))
2363				break;
2364			if (temp == dst[3])
2365				temp = brdc = temp_temp(pc);
2366		}
2367		emit_precossin(pc, temp, src[0][0]);
2368		emit_flop(pc, 4, brdc, temp);
2369		break;
2370	case TGSI_OPCODE_SLT:
2371	case TGSI_OPCODE_SGE:
2372	case TGSI_OPCODE_SEQ:
2373	case TGSI_OPCODE_SGT:
2374	case TGSI_OPCODE_SLE:
2375	case TGSI_OPCODE_SNE:
2376		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
2377		for (c = 0; c < 4; c++) {
2378			if (!(mask & (1 << c)))
2379				continue;
2380			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
2381		}
2382		break;
2383	case TGSI_OPCODE_SUB:
2384		for (c = 0; c < 4; c++) {
2385			if (!(mask & (1 << c)))
2386				continue;
2387			emit_sub(pc, dst[c], src[0][c], src[1][c]);
2388		}
2389		break;
2390	case TGSI_OPCODE_TEX:
2391		emit_tex(pc, dst, mask, src[0], unit,
2392			 inst->Texture.Texture, FALSE, 0);
2393		break;
2394	case TGSI_OPCODE_TXB:
2395		emit_tex(pc, dst, mask, src[0], unit,
2396			 inst->Texture.Texture, FALSE, -1);
2397		break;
2398	case TGSI_OPCODE_TXL:
2399		emit_tex(pc, dst, mask, src[0], unit,
2400			 inst->Texture.Texture, FALSE, 1);
2401		break;
2402	case TGSI_OPCODE_TXP:
2403		emit_tex(pc, dst, mask, src[0], unit,
2404			 inst->Texture.Texture, TRUE, 0);
2405		break;
2406	case TGSI_OPCODE_TRUNC:
2407		for (c = 0; c < 4; c++) {
2408			if (!(mask & (1 << c)))
2409				continue;
2410			emit_cvt(pc, dst[c], src[0][c], -1,
2411				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
2412		}
2413		break;
2414	case TGSI_OPCODE_XPD:
2415		temp = temp_temp(pc);
2416		if (mask & (1 << 0)) {
2417			emit_mul(pc, temp, src[0][2], src[1][1]);
2418			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
2419		}
2420		if (mask & (1 << 1)) {
2421			emit_mul(pc, temp, src[0][0], src[1][2]);
2422			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
2423		}
2424		if (mask & (1 << 2)) {
2425			emit_mul(pc, temp, src[0][1], src[1][0]);
2426			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
2427		}
2428		if (mask & (1 << 3))
2429			emit_mov_immdval(pc, dst[3], 1.0);
2430		break;
2431	case TGSI_OPCODE_END:
2432		break;
2433	default:
2434		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
2435		return FALSE;
2436	}
2437
2438	if (brdc) {
2439		if (sat)
2440			emit_sat(pc, brdc, brdc);
2441		for (c = 0; c < 4; c++)
2442			if ((mask & (1 << c)) && dst[c] != brdc)
2443				emit_mov(pc, dst[c], brdc);
2444	} else
2445	if (sat) {
2446		for (c = 0; c < 4; c++) {
2447			if (!(mask & (1 << c)))
2448				continue;
2449			/* In this case we saturate later, and dst[c] won't
2450			 * be another temp_temp (and thus lost), since rdst
2451			 * already is TEMP (see above). */
2452			if (rdst[c]->type == P_TEMP && rdst[c]->index < 0)
2453				continue;
2454			emit_sat(pc, rdst[c], dst[c]);
2455		}
2456	}
2457
2458	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2459		for (c = 0; c < 4; c++) {
2460			if (!src[i][c])
2461				continue;
2462			if (src[i][c]->acc < 0 && src[i][c]->type == P_CONST)
2463				FREE(src[i][c]); /* indirect constant */
2464		}
2465	}
2466
2467	kill_temp_temp(pc);
2468	pc->reg_instance_nr = 0;
2469
2470	return TRUE;
2471}
2472
2473static void
2474prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
2475{
2476	struct nv50_reg *reg = NULL;
2477	const struct tgsi_full_src_register *src;
2478	const struct tgsi_dst_register *dst;
2479	unsigned i, c, k, mask;
2480
2481	dst = &insn->Dst[0].Register;
2482	mask = dst->WriteMask;
2483
2484        if (dst->File == TGSI_FILE_TEMPORARY)
2485                reg = pc->temp;
2486        else
2487        if (dst->File == TGSI_FILE_OUTPUT)
2488                reg = pc->result;
2489
2490	if (reg) {
2491		for (c = 0; c < 4; c++) {
2492			if (!(mask & (1 << c)))
2493				continue;
2494			reg[dst->Index * 4 + c].acc = pc->insn_nr;
2495		}
2496	}
2497
2498	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2499		src = &insn->Src[i];
2500
2501		if (src->Register.File == TGSI_FILE_TEMPORARY)
2502			reg = pc->temp;
2503		else
2504		if (src->Register.File == TGSI_FILE_INPUT)
2505			reg = pc->attr;
2506		else
2507			continue;
2508
2509		mask = nv50_tgsi_src_mask(insn, i);
2510
2511		for (c = 0; c < 4; c++) {
2512			if (!(mask & (1 << c)))
2513				continue;
2514			k = tgsi_util_get_full_src_register_swizzle(src, c);
2515
2516			reg[src->Register.Index * 4 + k].acc = pc->insn_nr;
2517		}
2518	}
2519}
2520
/* Returns a bitmask indicating which dst components need to be
 * written to temporaries first to avoid 'corrupting' sources.
 *
 * m[i]   (out) indicate component to write in the i-th position
 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
 *
 * NOTE: bit i of the returned mask refers to the i-th position of the
 * write order in m[], not to component i.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* 'unsafe' is only ever OR-ed into below, so it must start at 0;
	 * leaving it uninitialized made the return value indeterminate.
	 */
	unsigned i, c, x, unsafe = 0;

	/* start with the natural x, y, z, w write order */
	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart (the loop increment resumes the scan at 1) */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
2565
2566/* Select a suitable dst register for broadcasting scalar results,
2567 * or return NULL if we have to allocate an extra TEMP.
2568 *
2569 * If e.g. only 1 component is written, we may also emit the final
2570 * result to a write-only register.
2571 */
2572static struct nv50_reg *
2573tgsi_broadcast_dst(struct nv50_pc *pc,
2574		   const struct tgsi_full_dst_register *fd, unsigned mask)
2575{
2576	if (fd->Register.File == TGSI_FILE_TEMPORARY) {
2577		int c = ffs(~mask & fd->Register.WriteMask);
2578		if (c)
2579			return tgsi_dst(pc, c - 1, fd);
2580	} else {
2581		int c = ffs(fd->Register.WriteMask) - 1;
2582		if ((1 << c) == fd->Register.WriteMask)
2583			return tgsi_dst(pc, c, fd);
2584	}
2585
2586	return NULL;
2587}
2588
2589/* Scan source swizzles and return a bitmask indicating dst regs that
2590 * also occur among the src regs, and fill rdep for nv50_revdep_reoder.
2591 */
2592static unsigned
2593nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
2594		       unsigned rdep[4])
2595{
2596	const struct tgsi_full_dst_register *fd = &insn->Dst[0];
2597	const struct tgsi_full_src_register *fs;
2598	unsigned i, deqs = 0;
2599
2600	for (i = 0; i < 4; ++i)
2601		rdep[i] = 0;
2602
2603	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2604		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
2605		boolean neg_supp = negate_supported(insn, i);
2606
2607		fs = &insn->Src[i];
2608		if (fs->Register.File != fd->Register.File ||
2609		    fs->Register.Index != fd->Register.Index)
2610			continue;
2611
2612		for (chn = 0; chn < 4; ++chn) {
2613			unsigned s, c;
2614
2615			if (!(mask & (1 << chn))) /* src is not read */
2616				continue;
2617			c = tgsi_util_get_full_src_register_swizzle(fs, chn);
2618			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
2619
2620			if (!(fd->Register.WriteMask & (1 << c)))
2621				continue;
2622
2623			/* no danger if src is copied to TEMP first */
2624			if ((s != TGSI_UTIL_SIGN_KEEP) &&
2625			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
2626				continue;
2627
2628			rdep[c] |= nv50_tgsi_dst_revdep(
2629				insn->Instruction.Opcode, i, chn);
2630			deqs |= (1 << c);
2631		}
2632	}
2633
2634	return deqs;
2635}
2636
2637static boolean
2638nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
2639{
2640	struct tgsi_full_instruction insn = tok->FullInstruction;
2641	const struct tgsi_full_dst_register *fd;
2642	unsigned i, deqs, rdep[4], m[4];
2643
2644	fd = &tok->FullInstruction.Dst[0];
2645	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
2646
2647	if (is_scalar_op(insn.Instruction.Opcode)) {
2648		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
2649		if (!pc->r_brdc)
2650			pc->r_brdc = temp_temp(pc);
2651		return nv50_program_tx_insn(pc, &insn);
2652	}
2653	pc->r_brdc = NULL;
2654
2655	if (!deqs)
2656		return nv50_program_tx_insn(pc, &insn);
2657
2658	deqs = nv50_revdep_reorder(m, rdep);
2659
2660	for (i = 0; i < 4; ++i) {
2661		assert(pc->r_dst[m[i]] == NULL);
2662
2663		insn.Dst[0].Register.WriteMask =
2664			fd->Register.WriteMask & (1 << m[i]);
2665
2666		if (!insn.Dst[0].Register.WriteMask)
2667			continue;
2668
2669		if (deqs & (1 << i))
2670			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
2671
2672		if (!nv50_program_tx_insn(pc, &insn))
2673			return FALSE;
2674	}
2675
2676	for (i = 0; i < 4; i++) {
2677		struct nv50_reg *reg = pc->r_dst[i];
2678		if (!reg)
2679			continue;
2680		pc->r_dst[i] = NULL;
2681
2682		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
2683			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
2684		else
2685			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
2686		free_temp(pc, reg);
2687	}
2688
2689	return TRUE;
2690}
2691
2692static void
2693load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
2694{
2695	struct nv50_reg *iv, **ppiv;
2696	unsigned mode = pc->interp_mode[reg->index];
2697
2698	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
2699	iv = *ppiv;
2700
2701	if ((mode & INTERP_PERSPECTIVE) && !iv) {
2702		iv = *ppiv = alloc_temp(pc, NULL);
2703		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
2704
2705		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2706		emit_flop(pc, 0, iv, iv);
2707
2708		/* XXX: when loading interpolants dynamically, move these
2709		 * to the program head, or make sure it can't be skipped.
2710		 */
2711	}
2712
2713	emit_interp(pc, reg, iv, mode);
2714}
2715
2716/* The face input is always at v[255] (varying space), with a
2717 * value of 0 for back-facing, and 0xffffffff for front-facing.
2718 */
2719static void
2720load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a)
2721{
2722	struct nv50_reg *one = alloc_immd(pc, 1.0f);
2723
2724	assert(a->rhw == -1);
2725	alloc_reg(pc, a); /* do this before rhw is set */
2726	a->rhw = 255;
2727	load_interpolant(pc, a);
2728	emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND);
2729
2730	FREE(one);
2731}
2732
2733static boolean
2734nv50_program_tx_prep(struct nv50_pc *pc)
2735{
2736	struct tgsi_parse_context tp;
2737	struct nv50_program *p = pc->p;
2738	boolean ret = FALSE;
2739	unsigned i, c, flat_nr = 0;
2740
2741	tgsi_parse_init(&tp, pc->p->pipe.tokens);
2742	while (!tgsi_parse_end_of_tokens(&tp)) {
2743		const union tgsi_full_token *tok = &tp.FullToken;
2744
2745		tgsi_parse_token(&tp);
2746		switch (tok->Token.Type) {
2747		case TGSI_TOKEN_TYPE_IMMEDIATE:
2748		{
2749			const struct tgsi_full_immediate *imm =
2750				&tp.FullToken.FullImmediate;
2751
2752			ctor_immd(pc, imm->u[0].Float,
2753				      imm->u[1].Float,
2754				      imm->u[2].Float,
2755				      imm->u[3].Float);
2756		}
2757			break;
2758		case TGSI_TOKEN_TYPE_DECLARATION:
2759		{
2760			const struct tgsi_full_declaration *d;
2761			unsigned si, last, first, mode;
2762
2763			d = &tp.FullToken.FullDeclaration;
2764			first = d->Range.First;
2765			last = d->Range.Last;
2766
2767			switch (d->Declaration.File) {
2768			case TGSI_FILE_TEMPORARY:
2769				break;
2770			case TGSI_FILE_OUTPUT:
2771				if (!d->Declaration.Semantic ||
2772				    p->type == PIPE_SHADER_FRAGMENT)
2773					break;
2774
2775				si = d->Semantic.Index;
2776				switch (d->Semantic.Name) {
2777				case TGSI_SEMANTIC_BCOLOR:
2778					p->cfg.two_side[si].hw = first;
2779					if (p->cfg.io_nr > first)
2780						p->cfg.io_nr = first;
2781					break;
2782				case TGSI_SEMANTIC_PSIZE:
2783					p->cfg.psiz = first;
2784					if (p->cfg.io_nr > first)
2785						p->cfg.io_nr = first;
2786					break;
2787					/*
2788				case TGSI_SEMANTIC_CLIP_DISTANCE:
2789					p->cfg.clpd = MIN2(p->cfg.clpd, first);
2790					break;
2791					*/
2792				default:
2793					break;
2794				}
2795				break;
2796			case TGSI_FILE_INPUT:
2797			{
2798				if (p->type != PIPE_SHADER_FRAGMENT)
2799					break;
2800
2801				switch (d->Declaration.Interpolate) {
2802				case TGSI_INTERPOLATE_CONSTANT:
2803					mode = INTERP_FLAT;
2804					flat_nr++;
2805					break;
2806				case TGSI_INTERPOLATE_PERSPECTIVE:
2807					mode = INTERP_PERSPECTIVE;
2808					p->cfg.regs[1] |= 0x08 << 24;
2809					break;
2810				default:
2811					mode = INTERP_LINEAR;
2812					break;
2813				}
2814				if (d->Declaration.Centroid)
2815					mode |= INTERP_CENTROID;
2816
2817				assert(last < 32);
2818				for (i = first; i <= last; i++)
2819					pc->interp_mode[i] = mode;
2820			}
2821				break;
2822			case TGSI_FILE_ADDRESS:
2823			case TGSI_FILE_CONSTANT:
2824			case TGSI_FILE_SAMPLER:
2825				break;
2826			default:
2827				NOUVEAU_ERR("bad decl file %d\n",
2828					    d->Declaration.File);
2829				goto out_err;
2830			}
2831		}
2832			break;
2833		case TGSI_TOKEN_TYPE_INSTRUCTION:
2834			pc->insn_nr++;
2835			prep_inspect_insn(pc, &tok->FullInstruction);
2836			break;
2837		default:
2838			break;
2839		}
2840	}
2841
2842	if (p->type == PIPE_SHADER_VERTEX) {
2843		int rid = 0;
2844
2845		for (i = 0; i < pc->attr_nr * 4; ++i) {
2846			if (pc->attr[i].acc) {
2847				pc->attr[i].hw = rid++;
2848				p->cfg.attr[i / 32] |= 1 << (i % 32);
2849			}
2850		}
2851
2852		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
2853			p->cfg.io[i].hw = rid;
2854			p->cfg.io[i].id = i;
2855
2856			for (c = 0; c < 4; ++c) {
2857				int n = i * 4 + c;
2858				if (!pc->result[n].acc)
2859					continue;
2860				pc->result[n].hw = rid++;
2861				p->cfg.io[i].mask |= 1 << c;
2862			}
2863		}
2864
2865		for (c = 0; c < 2; ++c)
2866			if (p->cfg.two_side[c].hw < 0x40)
2867				p->cfg.two_side[c] = p->cfg.io[
2868					p->cfg.two_side[c].hw];
2869
2870		if (p->cfg.psiz < 0x40)
2871			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
2872	} else
2873	if (p->type == PIPE_SHADER_FRAGMENT) {
2874		int rid, aid;
2875		unsigned n = 0, m = pc->attr_nr - flat_nr;
2876
2877		pc->allow32 = TRUE;
2878
2879		int base = (TGSI_SEMANTIC_POSITION ==
2880			    p->info.input_semantic_name[0]) ? 0 : 1;
2881
2882		/* non-flat interpolants have to be mapped to
2883		 * the lower hardware IDs, so sort them:
2884		 */
2885		for (i = 0; i < pc->attr_nr; i++) {
2886			if (pc->interp_mode[i] == INTERP_FLAT)
2887				p->cfg.io[m++].id = i;
2888			else {
2889				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
2890					p->cfg.io[n].linear = TRUE;
2891				p->cfg.io[n++].id = i;
2892			}
2893		}
2894
2895		if (!base) /* set w-coordinate mask from perspective interp */
2896			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
2897
2898		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
2899			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
2900
2901		for (n = 0; n < pc->attr_nr; ++n) {
2902			p->cfg.io[n].hw = rid = aid;
2903			i = p->cfg.io[n].id;
2904
2905			if (p->info.input_semantic_name[n] ==
2906			    TGSI_SEMANTIC_FACE) {
2907				load_frontfacing(pc, &pc->attr[i * 4]);
2908				continue;
2909			}
2910
2911			for (c = 0; c < 4; ++c) {
2912				if (!pc->attr[i * 4 + c].acc)
2913					continue;
2914				pc->attr[i * 4 + c].rhw = rid++;
2915				p->cfg.io[n].mask |= 1 << c;
2916
2917				load_interpolant(pc, &pc->attr[i * 4 + c]);
2918			}
2919			aid += popcnt4(p->cfg.io[n].mask);
2920		}
2921
2922		if (!base)
2923			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
2924
2925		m = popcnt4(p->cfg.regs[1] >> 24);
2926
2927		/* set count of non-position inputs and of non-flat
2928		 * non-position inputs for FP_INTERPOLANT_CTRL
2929		 */
2930		p->cfg.regs[1] |= aid - m;
2931
2932		if (flat_nr) {
2933			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
2934			p->cfg.regs[1] |= (i - m) << 16;
2935		} else
2936			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
2937
2938		/* mark color semantic for light-twoside */
2939		n = 0x40;
2940		for (i = 0; i < pc->attr_nr; i++) {
2941			ubyte si, sn;
2942
2943			sn = p->info.input_semantic_name[p->cfg.io[i].id];
2944			si = p->info.input_semantic_index[p->cfg.io[i].id];
2945
2946			if (sn == TGSI_SEMANTIC_COLOR) {
2947				p->cfg.two_side[si] = p->cfg.io[i];
2948
2949				/* increase colour count */
2950				p->cfg.regs[0] += popcnt4(
2951					p->cfg.two_side[si].mask) << 16;
2952
2953				n = MIN2(n, p->cfg.io[i].hw - m);
2954			}
2955		}
2956		if (n < 0x40)
2957			p->cfg.regs[0] += n;
2958
2959		/* Initialize FP results:
2960		 * FragDepth is always first TGSI and last hw output
2961		 */
2962		i = p->info.writes_z ? 4 : 0;
2963		for (rid = 0; i < pc->result_nr * 4; i++)
2964			pc->result[i].rhw = rid++;
2965		if (p->info.writes_z)
2966			pc->result[2].rhw = rid;
2967
2968		p->cfg.high_result = rid;
2969
2970		/* separate/different colour results for MRTs ? */
2971		if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1)
2972			p->cfg.regs[2] |= 1;
2973	}
2974
2975	if (pc->immd_nr) {
2976		int rid = 0;
2977
2978		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
2979		if (!pc->immd)
2980			goto out_err;
2981
2982		for (i = 0; i < pc->immd_nr; i++) {
2983			for (c = 0; c < 4; c++, rid++)
2984				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
2985		}
2986	}
2987
2988	ret = TRUE;
2989out_err:
2990	if (pc->iv_p)
2991		free_temp(pc, pc->iv_p);
2992	if (pc->iv_c)
2993		free_temp(pc, pc->iv_c);
2994
2995	tgsi_parse_free(&tp);
2996	return ret;
2997}
2998
2999static void
3000free_nv50_pc(struct nv50_pc *pc)
3001{
3002	if (pc->immd)
3003		FREE(pc->immd);
3004	if (pc->param)
3005		FREE(pc->param);
3006	if (pc->result)
3007		FREE(pc->result);
3008	if (pc->attr)
3009		FREE(pc->attr);
3010	if (pc->temp)
3011		FREE(pc->temp);
3012
3013	FREE(pc);
3014}
3015
3016static boolean
3017ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
3018{
3019	int i, c;
3020	unsigned rtype[2] = { P_ATTR, P_RESULT };
3021
3022	pc->p = p;
3023	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
3024	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
3025	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
3026	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
3027	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
3028	assert(pc->addr_nr <= 2);
3029
3030	p->cfg.high_temp = 4;
3031
3032	p->cfg.two_side[0].hw = 0x40;
3033	p->cfg.two_side[1].hw = 0x40;
3034
3035	switch (p->type) {
3036	case PIPE_SHADER_VERTEX:
3037		p->cfg.psiz = 0x40;
3038		p->cfg.clpd = 0x40;
3039		p->cfg.io_nr = pc->result_nr;
3040		break;
3041	case PIPE_SHADER_FRAGMENT:
3042		rtype[0] = rtype[1] = P_TEMP;
3043
3044		p->cfg.regs[0] = 0x01000004;
3045		p->cfg.io_nr = pc->attr_nr;
3046
3047		if (p->info.writes_z) {
3048			p->cfg.regs[2] |= 0x00000100;
3049			p->cfg.regs[3] |= 0x00000011;
3050		}
3051		if (p->info.uses_kill)
3052			p->cfg.regs[2] |= 0x00100000;
3053		break;
3054	}
3055
3056	if (pc->temp_nr) {
3057		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
3058		if (!pc->temp)
3059			return FALSE;
3060
3061		for (i = 0; i < pc->temp_nr * 4; ++i)
3062			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
3063	}
3064
3065	if (pc->attr_nr) {
3066		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
3067		if (!pc->attr)
3068			return FALSE;
3069
3070		for (i = 0; i < pc->attr_nr * 4; ++i)
3071			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
3072	}
3073
3074	if (pc->result_nr) {
3075		unsigned nr = pc->result_nr * 4;
3076
3077		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
3078		if (!pc->result)
3079			return FALSE;
3080
3081		for (i = 0; i < nr; ++i)
3082			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
3083	}
3084
3085	if (pc->param_nr) {
3086		int rid = 0;
3087
3088		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
3089		if (!pc->param)
3090			return FALSE;
3091
3092		for (i = 0; i < pc->param_nr; ++i)
3093			for (c = 0; c < 4; ++c, ++rid)
3094				ctor_reg(&pc->param[rid], P_CONST, i, rid);
3095	}
3096
3097	if (pc->addr_nr) {
3098		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *));
3099		if (!pc->addr)
3100			return FALSE;
3101	}
3102	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
3103		ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1);
3104
3105	return TRUE;
3106}
3107
3108static void
3109nv50_fp_move_results(struct nv50_pc *pc)
3110{
3111	struct nv50_reg reg;
3112	unsigned i;
3113
3114	ctor_reg(&reg, P_TEMP, -1, -1);
3115
3116	for (i = 0; i < pc->result_nr * 4; ++i) {
3117		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
3118			continue;
3119		if (pc->result[i].rhw != pc->result[i].hw) {
3120			reg.hw = pc->result[i].rhw;
3121			emit_mov(pc, &reg, &pc->result[i]);
3122		}
3123	}
3124}
3125
3126static void
3127nv50_program_fixup_insns(struct nv50_pc *pc)
3128{
3129	struct nv50_program_exec *e, **bra_list;
3130	unsigned i, n, pos;
3131
3132	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
3133
3134	/* Collect branch instructions, we need to adjust their offsets
3135	 * when converting 32 bit instructions to 64 bit ones
3136	 */
3137	for (n = 0, e = pc->p->exec_head; e; e = e->next)
3138		if (e->param.index >= 0 && !e->param.mask)
3139			bra_list[n++] = e;
3140
3141	/* last instruction must be long so it can have the exit bit set */
3142	if (!is_long(pc->p->exec_tail))
3143		convert_to_long(pc, pc->p->exec_tail);
3144	/* set exit bit */
3145	pc->p->exec_tail->inst[1] |= 1;
3146
3147	/* !immd on exit insn simultaneously means !join */
3148	assert(!is_immd(pc->p->exec_head));
3149	assert(!is_immd(pc->p->exec_tail));
3150
3151	/* Make sure we don't have any single 32 bit instructions. */
3152	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
3153		pos += is_long(e) ? 2 : 1;
3154
3155		if ((pos & 1) && (!e->next || is_long(e->next))) {
3156			for (i = 0; i < n; ++i)
3157				if (bra_list[i]->param.index >= pos)
3158					bra_list[i]->param.index += 1;
3159			convert_to_long(pc, e);
3160			++pos;
3161		}
3162	}
3163
3164	FREE(bra_list);
3165}
3166
3167static boolean
3168nv50_program_tx(struct nv50_program *p)
3169{
3170	struct tgsi_parse_context parse;
3171	struct nv50_pc *pc;
3172	boolean ret;
3173
3174	pc = CALLOC_STRUCT(nv50_pc);
3175	if (!pc)
3176		return FALSE;
3177
3178	ret = ctor_nv50_pc(pc, p);
3179	if (ret == FALSE)
3180		goto out_cleanup;
3181
3182	ret = nv50_program_tx_prep(pc);
3183	if (ret == FALSE)
3184		goto out_cleanup;
3185
3186	tgsi_parse_init(&parse, pc->p->pipe.tokens);
3187	while (!tgsi_parse_end_of_tokens(&parse)) {
3188		const union tgsi_full_token *tok = &parse.FullToken;
3189
3190		/* don't allow half insn/immd on first and last instruction */
3191		pc->allow32 = TRUE;
3192		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
3193			pc->allow32 = FALSE;
3194
3195		tgsi_parse_token(&parse);
3196
3197		switch (tok->Token.Type) {
3198		case TGSI_TOKEN_TYPE_INSTRUCTION:
3199			++pc->insn_cur;
3200			ret = nv50_tgsi_insn(pc, tok);
3201			if (ret == FALSE)
3202				goto out_err;
3203			break;
3204		default:
3205			break;
3206		}
3207	}
3208
3209	if (pc->p->type == PIPE_SHADER_FRAGMENT)
3210		nv50_fp_move_results(pc);
3211
3212	nv50_program_fixup_insns(pc);
3213
3214	p->param_nr = pc->param_nr * 4;
3215	p->immd_nr = pc->immd_nr * 4;
3216	p->immd = pc->immd_buf;
3217
3218out_err:
3219	tgsi_parse_free(&parse);
3220
3221out_cleanup:
3222	free_nv50_pc(pc);
3223	return ret;
3224}
3225
3226static void
3227nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
3228{
3229	if (nv50_program_tx(p) == FALSE)
3230		assert(0);
3231	p->translated = TRUE;
3232}
3233
3234static void
3235nv50_program_upload_data(struct nv50_context *nv50, float *map,
3236			unsigned start, unsigned count, unsigned cbuf)
3237{
3238	struct nouveau_channel *chan = nv50->screen->base.channel;
3239	struct nouveau_grobj *tesla = nv50->screen->tesla;
3240
3241	while (count) {
3242		unsigned nr = count > 2047 ? 2047 : count;
3243
3244		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
3245		OUT_RING  (chan, (cbuf << 0) | (start << 8));
3246		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
3247		OUT_RINGp (chan, map, nr);
3248
3249		map += nr;
3250		start += nr;
3251		count -= nr;
3252	}
3253}
3254
3255static void
3256nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
3257{
3258	struct pipe_screen *pscreen = nv50->pipe.screen;
3259
3260	if (!p->data[0] && p->immd_nr) {
3261		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
3262
3263		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
3264			while (heap->next && heap->size < p->immd_nr) {
3265				struct nv50_program *evict = heap->next->priv;
3266				nouveau_resource_free(&evict->data[0]);
3267			}
3268
3269			if (nouveau_resource_alloc(heap, p->immd_nr, p,
3270						   &p->data[0]))
3271				assert(0);
3272		}
3273
3274		/* immediates only need to be uploaded again when freed */
3275		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
3276					 p->immd_nr, NV50_CB_PMISC);
3277	}
3278
3279	assert(p->param_nr <= 512);
3280
3281	if (p->param_nr) {
3282		unsigned cb;
3283		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
3284					     PIPE_BUFFER_USAGE_CPU_READ);
3285
3286		if (p->type == PIPE_SHADER_VERTEX)
3287			cb = NV50_CB_PVP;
3288		else
3289			cb = NV50_CB_PFP;
3290
3291		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
3292		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
3293	}
3294}
3295
3296static void
3297nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
3298{
3299	struct nouveau_channel *chan = nv50->screen->base.channel;
3300	struct nv50_program_exec *e;
3301	uint32_t *up, i;
3302	boolean upload = FALSE;
3303
3304	if (!p->bo) {
3305		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
3306			       p->exec_size * 4, &p->bo);
3307		upload = TRUE;
3308	}
3309
3310	if (p->data[0] && p->data[0]->start != p->data_start[0])
3311		upload = TRUE;
3312
3313	if (!upload)
3314		return;
3315
3316	up = MALLOC(p->exec_size * 4);
3317
3318	for (i = 0, e = p->exec_head; e; e = e->next) {
3319		unsigned ei, ci, bs;
3320
3321		if (e->param.index >= 0 && e->param.mask) {
3322			bs = (e->inst[1] >> 22) & 0x07;
3323			assert(bs < 2);
3324			ei = e->param.shift >> 5;
3325			ci = e->param.index;
3326			if (bs == 0)
3327				ci += p->data[bs]->start;
3328
3329			e->inst[ei] &= ~e->param.mask;
3330			e->inst[ei] |= (ci << e->param.shift);
3331		} else
3332		if (e->param.index >= 0) {
3333			/* zero mask means param is a jump/branch offset */
3334			assert(!(e->param.index & 1));
3335			/* seem to be 8 byte steps */
3336			ei = (e->param.index >> 1) + 0 /* START_ID */;
3337
3338			e->inst[0] &= 0xf0000fff;
3339			e->inst[0] |= ei << 12;
3340		}
3341
3342		up[i++] = e->inst[0];
3343		if (is_long(e))
3344			up[i++] = e->inst[1];
3345	}
3346	assert(i == p->exec_size);
3347
3348	if (p->data[0])
3349		p->data_start[0] = p->data[0]->start;
3350
3351#ifdef NV50_PROGRAM_DUMP
3352	NOUVEAU_ERR("-------\n");
3353	for (e = p->exec_head; e; e = e->next) {
3354		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
3355		if (is_long(e))
3356			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
3357	}
3358#endif
3359	nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM,
3360			 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144,
3361			 up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0,
3362			 0, 0, p->exec_size * 4, 1, 1);
3363
3364	FREE(up);
3365}
3366
3367void
3368nv50_vertprog_validate(struct nv50_context *nv50)
3369{
3370	struct nouveau_grobj *tesla = nv50->screen->tesla;
3371	struct nv50_program *p = nv50->vertprog;
3372	struct nouveau_stateobj *so;
3373
3374	if (!p->translated) {
3375		nv50_program_validate(nv50, p);
3376		if (!p->translated)
3377			assert(0);
3378	}
3379
3380	nv50_program_validate_data(nv50, p);
3381	nv50_program_validate_code(nv50, p);
3382
3383	so = so_new(13, 2);
3384	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
3385	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3386		      NOUVEAU_BO_HIGH, 0, 0);
3387	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3388		      NOUVEAU_BO_LOW, 0, 0);
3389	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
3390	so_data  (so, p->cfg.attr[0]);
3391	so_data  (so, p->cfg.attr[1]);
3392	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
3393	so_data  (so, p->cfg.high_result);
3394	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
3395	so_data  (so, p->cfg.high_result); //8);
3396	so_data  (so, p->cfg.high_temp);
3397	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
3398	so_data  (so, 0); /* program start offset */
3399	so_ref(so, &nv50->state.vertprog);
3400	so_ref(NULL, &so);
3401}
3402
3403void
3404nv50_fragprog_validate(struct nv50_context *nv50)
3405{
3406	struct nouveau_grobj *tesla = nv50->screen->tesla;
3407	struct nv50_program *p = nv50->fragprog;
3408	struct nouveau_stateobj *so;
3409
3410	if (!p->translated) {
3411		nv50_program_validate(nv50, p);
3412		if (!p->translated)
3413			assert(0);
3414	}
3415
3416	nv50_program_validate_data(nv50, p);
3417	nv50_program_validate_code(nv50, p);
3418
3419	so = so_new(64, 2);
3420	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
3421	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3422		      NOUVEAU_BO_HIGH, 0, 0);
3423	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3424		      NOUVEAU_BO_LOW, 0, 0);
3425	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
3426	so_data  (so, p->cfg.high_temp);
3427	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
3428	so_data  (so, p->cfg.high_result);
3429	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
3430	so_data  (so, p->cfg.regs[2]);
3431	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
3432	so_data  (so, p->cfg.regs[3]);
3433	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
3434	so_data  (so, 0); /* program start offset */
3435	so_ref(so, &nv50->state.fragprog);
3436	so_ref(NULL, &so);
3437}
3438
3439static void
3440nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
3441{
3442	struct nv50_program *fp = nv50->fragprog;
3443	struct nv50_program *vp = nv50->vertprog;
3444	unsigned i, c, m = base;
3445
3446	/* XXX: this might not work correctly in all cases yet - we'll
3447	 * just assume that an FP generic input that is not written in
3448	 * the VP is PointCoord.
3449	 */
3450	memset(pntc, 0, 8 * sizeof(uint32_t));
3451
3452	for (i = 0; i < fp->cfg.io_nr; i++) {
3453		uint8_t sn, si;
3454		uint8_t j, k = fp->cfg.io[i].id;
3455		unsigned n = popcnt4(fp->cfg.io[i].mask);
3456
3457		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
3458			m += n;
3459			continue;
3460		}
3461
3462		for (j = 0; j < vp->info.num_outputs; ++j) {
3463			sn = vp->info.output_semantic_name[j];
3464			si = vp->info.output_semantic_index[j];
3465
3466			if (sn == fp->info.input_semantic_name[k] &&
3467			    si == fp->info.input_semantic_index[k])
3468				break;
3469		}
3470
3471		if (j < vp->info.num_outputs) {
3472			ubyte mode =
3473				nv50->rasterizer->pipe.sprite_coord_mode[si];
3474
3475			if (mode == PIPE_SPRITE_COORD_NONE) {
3476				m += n;
3477				continue;
3478			}
3479		}
3480
3481		/* this is either PointCoord or replaced by sprite coords */
3482		for (c = 0; c < 4; c++) {
3483			if (!(fp->cfg.io[i].mask & (1 << c)))
3484				continue;
3485			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
3486			++m;
3487		}
3488	}
3489}
3490
3491static int
3492nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
3493	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
3494{
3495	int c;
3496	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
3497	uint8_t *map = (uint8_t *)p_map;
3498
3499	for (c = 0; c < 4; ++c) {
3500		if (mf & 1) {
3501			if (fpi->linear == TRUE)
3502				lin[mid / 32] |= 1 << (mid % 32);
3503			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
3504		}
3505
3506		oid += mv & 1;
3507		mf >>= 1;
3508		mv >>= 1;
3509	}
3510
3511	return mid;
3512}
3513
3514void
3515nv50_linkage_validate(struct nv50_context *nv50)
3516{
3517	struct nouveau_grobj *tesla = nv50->screen->tesla;
3518	struct nv50_program *vp = nv50->vertprog;
3519	struct nv50_program *fp = nv50->fragprog;
3520	struct nouveau_stateobj *so;
3521	struct nv50_sreg4 dummy, *vpo;
3522	int i, n, c, m = 0;
3523	uint32_t map[16], lin[4], reg[5], pcrd[8];
3524
3525	memset(map, 0, sizeof(map));
3526	memset(lin, 0, sizeof(lin));
3527
3528	reg[1] = 0x00000004; /* low and high clip distance map ids */
3529	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
3530	reg[3] = 0x00000000; /* point size map id & enable */
3531	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
3532	reg[4] = fp->cfg.regs[1]; /* interpolant info */
3533
3534	dummy.linear = FALSE;
3535	dummy.mask = 0xf; /* map all components of HPOS */
3536	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
3537
3538	dummy.mask = 0x0;
3539
3540	if (vp->cfg.clpd < 0x40) {
3541		for (c = 0; c < vp->cfg.clpd_nr; ++c)
3542			map[m++] = vp->cfg.clpd + c;
3543		reg[1] = (m << 8);
3544	}
3545
3546	reg[0] |= m << 8; /* adjust BFC0 id */
3547
3548	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
3549	if (nv50->rasterizer->pipe.light_twoside) {
3550		vpo = &vp->cfg.two_side[0];
3551
3552		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
3553		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
3554	}
3555
3556	reg[0] += m - 4; /* adjust FFC0 id */
3557	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
3558
3559	for (i = 0; i < fp->cfg.io_nr; i++) {
3560		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id];
3561		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id];
3562
3563		/* position must be mapped first */
3564		assert(i == 0 || sn != TGSI_SEMANTIC_POSITION);
3565
3566		/* maybe even remove these from cfg.io */
3567		if (sn == TGSI_SEMANTIC_POSITION || sn == TGSI_SEMANTIC_FACE)
3568			continue;
3569
3570		/* VP outputs and vp->cfg.io are in the same order */
3571		for (n = 0; n < vp->info.num_outputs; ++n) {
3572			if (vp->info.output_semantic_name[n] == sn &&
3573			    vp->info.output_semantic_index[n] == si)
3574				break;
3575		}
3576		vpo = (n < vp->info.num_outputs) ? &vp->cfg.io[n] : &dummy;
3577
3578		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
3579	}
3580
3581	if (nv50->rasterizer->pipe.point_size_per_vertex) {
3582		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
3583		reg[3] = (m++ << 4) | 1;
3584	}
3585
3586	/* now fill the stateobj */
3587	so = so_new(64, 0);
3588
3589	n = (m + 3) / 4;
3590	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
3591	so_data  (so, m);
3592	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
3593	so_datap (so, map, n);
3594
3595	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
3596	so_datap (so, reg, 4);
3597
3598	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
3599	so_data  (so, reg[4]);
3600
3601	so_method(so, tesla, 0x1540, 4);
3602	so_datap (so, lin, 4);
3603
3604	if (nv50->rasterizer->pipe.point_sprite) {
3605		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
3606
3607		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
3608		so_datap (so, pcrd, 8);
3609	}
3610
3611        so_ref(so, &nv50->state.programs);
3612        so_ref(NULL, &so);
3613}
3614
3615void
3616nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
3617{
3618	while (p->exec_head) {
3619		struct nv50_program_exec *e = p->exec_head;
3620
3621		p->exec_head = e->next;
3622		FREE(e);
3623	}
3624	p->exec_tail = NULL;
3625	p->exec_size = 0;
3626
3627	nouveau_bo_ref(NULL, &p->bo);
3628
3629	nouveau_resource_free(&p->data[0]);
3630
3631	p->translated = 0;
3632}
3633