nv50_program.c revision 496c9eaacfabc4df4e6fb5ba230e60dc660554c8
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
34#define NV50_SU_MAX_TEMP 127
35#define NV50_SU_MAX_ADDR 4
36//#define NV50_PROGRAM_DUMP
37
38/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */
39
40/* ARL - gallium chokes on progs/vp/arl.txt
41 *
42 * MSB - Like MAD, but MUL+SUB
43 * 	- Drop it; instead introduce a way to negate args for ops that
44 * 	  support it.
45 *
46 * Look into inlining IMMD for ops other than MOV (make it general?)
47 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
48 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
49 *
50 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
51 * case, if the emit_src() causes the inst to suddenly become long.
52 *
53 * Verify half-insns work where expected, and force-disable them where they
54 * don't - MUL currently has them force-disabled, which fixes POW.
55 *
56 * Watch dst==src vectors: we can overwrite components that are still needed.
57 * 	e.g. SUB R0, R0.yzxw, R0
58 *
59 * Things to check with renouveau:
60 * 	FP attr/result assignment - how?
61 * 		attrib
62 * 			- 0x16bc maps vp output onto fp hpos
63 * 			- 0x16c0 maps vp output onto fp col0
64 * 		result
65 * 			- colr always 0-3
66 * 			- depr always 4
67 * 0x16bc->0x16e8 --> some binding between vp/fp regs
68 * 0x16b8 --> VP output count
69 *
70 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
71 * 	      "MOV rcol.x, fcol.y" = 0x00000004
72 * 0x19a8 --> as above but 0x00000100 and 0x00000000
73 * 	- 0x00100000 used when KIL used
74 * 0x196c --> as above but 0x00000011 and 0x00000000
75 *
76 * 0x1988 --> 0xXXNNNNNN
77 * 	- XX == FP high something
78 */
79struct nv50_reg {
80	enum {
81		P_TEMP,
82		P_ATTR,
83		P_RESULT,
84		P_CONST,
85		P_IMMD,
86		P_ADDR
87	} type;
88	int index;
89
90	int hw;
91	int neg;
92
93	int rhw; /* result hw for FP outputs, or interpolant index */
94	int acc; /* instruction where this reg is last read (first insn == 1) */
95};
96
97/* arbitrary limits */
98#define MAX_IF_DEPTH 4
99#define MAX_LOOP_DEPTH 4
100
101struct nv50_pc {
102	struct nv50_program *p;
103
104	/* hw resources */
105	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
106	struct nv50_reg r_addr[NV50_SU_MAX_ADDR];
107
108	/* tgsi resources */
109	struct nv50_reg *temp;
110	int temp_nr;
111	struct nv50_reg *attr;
112	int attr_nr;
113	struct nv50_reg *result;
114	int result_nr;
115	struct nv50_reg *param;
116	int param_nr;
117	struct nv50_reg *immd;
118	float *immd_buf;
119	int immd_nr;
120	struct nv50_reg **addr;
121	int addr_nr;
122
123	struct nv50_reg *temp_temp[16];
124	unsigned temp_temp_nr;
125
126	/* broadcast and destination replacement regs */
127	struct nv50_reg *r_brdc;
128	struct nv50_reg *r_dst[4];
129
130	unsigned interp_mode[32];
131	/* perspective interpolation registers */
132	struct nv50_reg *iv_p;
133	struct nv50_reg *iv_c;
134
135	struct nv50_program_exec *if_cond;
136	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
137	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
138	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
139	int if_lvl, loop_lvl;
140	unsigned loop_pos[MAX_LOOP_DEPTH];
141
142	/* current instruction and total number of insns */
143	unsigned insn_cur;
144	unsigned insn_nr;
145
146	boolean allow32;
147};
148
149static INLINE void
150ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
151{
152	reg->type = type;
153	reg->index = index;
154	reg->hw = hw;
155	reg->neg = 0;
156	reg->rhw = -1;
157	reg->acc = 0;
158}
159
160static INLINE unsigned
161popcnt4(uint32_t val)
162{
163	static const unsigned cnt[16]
164	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
165	return cnt[val & 0xf];
166}
167
168static void
169terminate_mbb(struct nv50_pc *pc)
170{
171	int i;
172
173	/* remove records of temporary address register values */
174	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
175		if (pc->r_addr[i].index < 0)
176			pc->r_addr[i].rhw = -1;
177}
178
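/* Bind reg to a hardware resource: results only raise cfg.high_result,
 * temporaries are given a free GPR - preferring reg->rhw (fixed FP output
 * placement), else the first free slot (above result_nr * 4 if rhw was
 * taken) - and cfg.high_temp is raised accordingly.
 */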
179static void
180alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
181{
182	int i = 0;
183
184	if (reg->type == P_RESULT) {
185		if (pc->p->cfg.high_result < (reg->hw + 1))
186			pc->p->cfg.high_result = reg->hw + 1;
187	}
188
189	if (reg->type != P_TEMP)
190		return;
191
192	if (reg->hw >= 0) {
193		/*XXX: do this here too to catch FP temp-as-attr usage..
194		 *     not clean, but works */
195		if (pc->p->cfg.high_temp < (reg->hw + 1))
196			pc->p->cfg.high_temp = reg->hw + 1;
197		return;
198	}
199
200	if (reg->rhw != -1) {
201		/* try to allocate temporary with index rhw first */
202		if (!(pc->r_temp[reg->rhw])) {
203			pc->r_temp[reg->rhw] = reg;
204			reg->hw = reg->rhw;
205			if (pc->p->cfg.high_temp < (reg->rhw + 1))
206				pc->p->cfg.high_temp = reg->rhw + 1;
207			return;
208		}
209		/* make sure we don't get things like $r0 needs to go
210		 * in $r1 and $r1 in $r0
211		 */
212		i = pc->result_nr * 4;
213	}
214
215	for (; i < NV50_SU_MAX_TEMP; i++) {
216		if (!(pc->r_temp[i])) {
217			pc->r_temp[i] = reg;
218			reg->hw = i;
219			if (pc->p->cfg.high_temp < (i + 1))
220				pc->p->cfg.high_temp = i + 1;
221			return;
222		}
223	}
224
225	assert(0);
226}
227
228/* XXX: For shaders that aren't executed linearly (e.g. shaders that
229 * contain loops), we need to assign all hw regs to TGSI TEMPs early,
230 * lest we risk temp_temps overwriting regs alloc'd "later".
231 */
232static struct nv50_reg *
233alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
234{
235	struct nv50_reg *r;
236	int i;
237
238	if (dst && dst->type == P_TEMP && dst->hw == -1)
239		return dst;
240
241	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
242		if (!pc->r_temp[i]) {
243			r = MALLOC_STRUCT(nv50_reg);
244			ctor_reg(r, P_TEMP, -1, i);
245			pc->r_temp[i] = r;
246			return r;
247		}
248	}
249
250	assert(0);
251	return NULL;
252}
253
254/* Assign the hw of the discarded temporary register src
255 * to the tgsi register dst and free src.
256 */
257static void
258assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
259{
260	assert(src->index == -1 && src->hw != -1);
261
262	if (dst->hw != -1)
263		pc->r_temp[dst->hw] = NULL;
264	pc->r_temp[src->hw] = dst;
265	dst->hw = src->hw;
266
267	FREE(src);
268}
269
270/* release the hardware resource held by r */
271static void
272release_hw(struct nv50_pc *pc, struct nv50_reg *r)
273{
274	assert(r->type == P_TEMP);
275	if (r->hw == -1)
276		return;
277
278	assert(pc->r_temp[r->hw] == r);
279	pc->r_temp[r->hw] = NULL;
280
281	r->acc = 0;
282	if (r->index == -1)
283		FREE(r);
284}
285
286static void
287free_temp(struct nv50_pc *pc, struct nv50_reg *r)
288{
289	if (r->index == -1) {
290		unsigned hw = r->hw;
291
292		FREE(pc->r_temp[hw]);
293		pc->r_temp[hw] = NULL;
294	}
295}
296
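/* Allocate four consecutive hardware temps, starting at the first fully
 * free naturally aligned group at or above idx (emit_tex passes 0, since
 * some cards want t[0] at a multiple of 4).
 */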
297static int
298alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
299{
300	int i;
301
302	if ((idx + 4) >= NV50_SU_MAX_TEMP)
303		return 1;
304
305	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
306	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
307		return alloc_temp4(pc, dst, idx + 4);
308
309	for (i = 0; i < 4; i++) {
310		dst[i] = MALLOC_STRUCT(nv50_reg);
311		ctor_reg(dst[i], P_TEMP, -1, idx + i);
312		pc->r_temp[idx + i] = dst[i];
313	}
314
315	return 0;
316}
317
318static void
319free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
320{
321	int i;
322
323	for (i = 0; i < 4; i++)
324		free_temp(pc, reg[i]);
325}
326
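/* Allocate a scratch temporary that only lives for the duration of the
 * current TGSI instruction; kill_temp_temp() releases all of them.
 */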
327static struct nv50_reg *
328temp_temp(struct nv50_pc *pc)
329{
330	if (pc->temp_temp_nr >= 16)
331		assert(0);
332
333	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
334	return pc->temp_temp[pc->temp_temp_nr++];
335}
336
337static void
338kill_temp_temp(struct nv50_pc *pc)
339{
340	int i;
341
342	for (i = 0; i < pc->temp_temp_nr; i++)
343		free_temp(pc, pc->temp_temp[i]);
344	pc->temp_temp_nr = 0;
345}
346
347static int
348ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
349{
350	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
351			       (pc->immd_nr + 1) * 4 * sizeof(float));
352	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
353	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
354	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
355	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
356
357	return pc->immd_nr++;
358}
359
360static struct nv50_reg *
361alloc_immd(struct nv50_pc *pc, float f)
362{
363	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
364	unsigned hw;
365
366	for (hw = 0; hw < pc->immd_nr * 4; hw++)
367		if (pc->immd_buf[hw] == f)
368			break;
369
370	if (hw == pc->immd_nr * 4)
371		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
372
373	ctor_reg(r, P_IMMD, -1, hw);
374	return r;
375}
376
377static struct nv50_program_exec *
378exec(struct nv50_pc *pc)
379{
380	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
381
382	e->param.index = -1;
383	return e;
384}
385
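/* Append e to the program's exec list; long (64-bit) instructions count
 * as two words in exec_size, short ones as one.
 */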
386static void
387emit(struct nv50_pc *pc, struct nv50_program_exec *e)
388{
389	struct nv50_program *p = pc->p;
390
391	if (p->exec_tail)
392		p->exec_tail->next = e;
393	if (!p->exec_head)
394		p->exec_head = e;
395	p->exec_tail = e;
396	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
397}
398
399static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
400
401static boolean
402is_long(struct nv50_program_exec *e)
403{
404	if (e->inst[0] & 1)
405		return TRUE;
406	return FALSE;
407}
408
409static boolean
410is_immd(struct nv50_program_exec *e)
411{
412	if (is_long(e) && (e->inst[1] & 3) == 3)
413		return TRUE;
414	return FALSE;
415}
416
417static INLINE void
418set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
419	 struct nv50_program_exec *e)
420{
421	set_long(pc, e);
422	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
423	e->inst[1] |= (pred << 7) | (idx << 12);
424}
425
426static INLINE void
427set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
428	    struct nv50_program_exec *e)
429{
430	set_long(pc, e);
431	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
432	e->inst[1] |= (idx << 4) | (on << 6);
433}
434
435static INLINE void
436set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
437{
438	if (is_long(e))
439		return;
440
441	e->inst[0] |= 1;
442	set_pred(pc, 0xf, 0, e);
443	set_pred_wr(pc, 0, 0, e);
444}
445
446static INLINE void
447set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
448{
449	if (dst->type == P_RESULT) {
450		set_long(pc, e);
451		e->inst[1] |= 0x00000008;
452	}
453
454	alloc_reg(pc, dst);
455	if (dst->hw > 63)
456		set_long(pc, e);
457	e->inst[0] |= (dst->hw << 2);
458}
459
460static INLINE void
461set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
462{
463	float f = pc->immd_buf[imm->hw];
464	unsigned val = fui(imm->neg ? -f : f);
465
466	set_long(pc, e);
467	/*XXX: can't be predicated - bits overlap.. catch cases where both
468	 *     are required and avoid them. */
469	set_pred(pc, 0, 0, e);
470	set_pred_wr(pc, 0, 0, e);
471
472	e->inst[1] |= 0x00000002 | 0x00000001;
473	e->inst[0] |= (val & 0x3f) << 16;
474	e->inst[1] |= (val >> 6) << 2;
475}
476
477static INLINE void
478set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
479{
480	assert(!(e->inst[0] & 0x0c000000));
481	assert(!(e->inst[1] & 0x00000004));
482
483	e->inst[0] |= (a->hw & 3) << 26;
484	e->inst[1] |= (a->hw >> 2) << 2;
485}
486
487static void
488emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
489		  struct nv50_reg *src0, uint16_t src1_val)
490{
491	struct nv50_program_exec *e = exec(pc);
492
493	e->inst[0] = 0xd0000000 | (src1_val << 9);
494	e->inst[1] = 0x20000000;
495	set_long(pc, e);
496	e->inst[0] |= dst->hw << 2;
497	if (src0) /* otherwise will add to $a0, which is always 0 */
498		set_addr(e, src0);
499
500	emit(pc, e);
501}
502
503static struct nv50_reg *
504alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
505{
506	int i;
507	struct nv50_reg *a_tgsi = NULL, *a = NULL;
508
509	if (!ref) {
510		/* allocate for TGSI address reg */
511		for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
512			if (pc->r_addr[i].index >= 0)
513				continue;
514			if (pc->r_addr[i].rhw >= 0 &&
515			    pc->r_addr[i].acc == pc->insn_cur)
516				continue;
517
518			pc->r_addr[i].rhw = -1;
519			pc->r_addr[i].index = i;
520			return &pc->r_addr[i];
521		}
522		assert(0);
523		return NULL;
524	}
525
526	/* Allocate and set an address reg so we can access 'ref'.
527	 *
528	 * If an r_addr has index < 0, it is not reserved for TGSI,
529	 * and index will be the negative of the TGSI addr index the
530	 * value in rhw is relative to, or -256 if rhw is an offset
531	 * from 0. If rhw < 0, the reg has not been initialized.
532	 */
533	for (i = NV50_SU_MAX_ADDR - 1; i >= 0; --i) {
534		if (pc->r_addr[i].index >= 0) /* occupied for TGSI */
535			continue;
536		if (pc->r_addr[i].rhw < 0) { /* unused */
537			a = &pc->r_addr[i];
538			continue;
539		}
540		if (!a && pc->r_addr[i].acc != pc->insn_cur)
541			a = &pc->r_addr[i];
542
543		if (ref->hw - pc->r_addr[i].rhw >= 128)
544			continue;
545
546		if ((ref->acc >= 0 && pc->r_addr[i].index == -256) ||
547		    (ref->acc < 0 && -pc->r_addr[i].index == ref->index)) {
548			pc->r_addr[i].acc = pc->insn_cur;
549			return &pc->r_addr[i];
550		}
551	}
552	assert(a);
553
554	if (ref->acc < 0)
555		a_tgsi = pc->addr[ref->index];
556
557	emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4);
558
559	a->rhw = ref->hw & ~0x7f;
560	a->acc = pc->insn_cur;
561	a->index = a_tgsi ? -ref->index : -256;
562	return a;
563}
564
565#define INTERP_LINEAR		0
566#define INTERP_FLAT		1
567#define INTERP_PERSPECTIVE	2
568#define INTERP_CENTROID		4
569
570/* interpolant index has been stored in dst->rhw */
571static void
572emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
573		unsigned mode)
574{
575	struct nv50_program_exec *e = exec(pc);
576	assert(dst->rhw != -1);
577
578	e->inst[0] |= 0x80000000;
579	set_dst(pc, dst, e);
580	e->inst[0] |= (dst->rhw << 16);
581
582	if (mode & INTERP_FLAT) {
583		e->inst[0] |= (1 << 8);
584	} else {
585		if (mode & INTERP_PERSPECTIVE) {
586			e->inst[0] |= (1 << 25);
587			alloc_reg(pc, iv);
588			e->inst[0] |= (iv->hw << 9);
589		}
590
591		if (mode & INTERP_CENTROID)
592			e->inst[0] |= (1 << 24);
593	}
594
595	emit(pc, e);
596}
597
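/* Reference a constant-buffer (or immediate) element as a source operand.
 * param.index/shift/mask record where the value's final location has to be
 * patched in later (presumably at upload time); hw indices > 127 and
 * indirect constants additionally go through an address register.
 */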
598static void
599set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
600	 struct nv50_program_exec *e)
601{
602	set_long(pc, e);
603
604	e->param.index = src->hw & 127;
605	e->param.shift = s;
606	e->param.mask = m << (s % 32);
607
608	if (src->hw > 127)
609		set_addr(e, alloc_addr(pc, src));
610	else
611	if (src->acc < 0) {
612		assert(src->type == P_CONST);
613		set_addr(e, pc->addr[src->index]);
614	}
615
616	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
617}
618
619static void
620emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
621{
622	struct nv50_program_exec *e = exec(pc);
623
624	e->inst[0] = 0x10000000;
625	if (!pc->allow32)
626		set_long(pc, e);
627
628	set_dst(pc, dst, e);
629
630	if (!is_long(e) && src->type == P_IMMD) {
631		set_immd(pc, src, e);
632		/*XXX: 32-bit, but steals part of "half" reg space - need to
633		 *     catch and handle this case if/when we do half-regs
634		 */
635	} else
636	if (src->type == P_IMMD || src->type == P_CONST) {
637		set_long(pc, e);
638		set_data(pc, src, 0x7f, 9, e);
639		e->inst[1] |= 0x20000000; /* src0 const? */
640	} else {
641		if (src->type == P_ATTR) {
642			set_long(pc, e);
643			e->inst[1] |= 0x00200000;
644		}
645
646		alloc_reg(pc, src);
647		if (src->hw > 63)
648			set_long(pc, e);
649		e->inst[0] |= (src->hw << 9);
650	}
651
652	if (is_long(e) && !is_immd(e)) {
653		e->inst[1] |= 0x04000000; /* 32-bit */
654		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
655		if (!(e->inst[1] & 0x20000000))
656			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
657	} else
658		e->inst[0] |= 0x00008000;
659
660	emit(pc, e);
661}
662
663static INLINE void
664emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
665{
666	struct nv50_reg *imm = alloc_immd(pc, f);
667	emit_mov(pc, dst, imm);
668	FREE(imm);
669}
670
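/* Swap the two sources where that helps encoding: a constant is moved into
 * src1, which can reference it directly via set_data(), and an attribute
 * into src0, the only slot that encodes attributes without a copy (see
 * set_src_0/set_src_1 below). Returns TRUE if a swap was done so callers
 * like emit_set() can mirror their condition code.
 */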
671static boolean
672check_swap_src_0_1(struct nv50_pc *pc,
673		   struct nv50_reg **s0, struct nv50_reg **s1)
674{
675	struct nv50_reg *src0 = *s0, *src1 = *s1;
676
677	if (src0->type == P_CONST) {
678		if (src1->type != P_CONST) {
679			*s0 = src1;
680			*s1 = src0;
681			return TRUE;
682		}
683	} else
684	if (src1->type == P_ATTR) {
685		if (src0->type != P_ATTR) {
686			*s0 = src1;
687			*s1 = src0;
688			return TRUE;
689		}
690	}
691
692	return FALSE;
693}
694
695static void
696set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
697		     struct nv50_program_exec *e)
698{
699	struct nv50_reg *temp;
700
701	if (src->type != P_TEMP) {
702		temp = temp_temp(pc);
703		emit_mov(pc, temp, src);
704		src = temp;
705	}
706
707	alloc_reg(pc, src);
708	if (src->hw > 63)
709		set_long(pc, e);
710	e->inst[0] |= (src->hw << 9);
711}
712
713static void
714set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
715{
716	if (src->type == P_ATTR) {
717		set_long(pc, e);
718		e->inst[1] |= 0x00200000;
719	} else
720	if (src->type == P_CONST || src->type == P_IMMD) {
721		struct nv50_reg *temp = temp_temp(pc);
722
723		emit_mov(pc, temp, src);
724		src = temp;
725	}
726
727	alloc_reg(pc, src);
728	if (src->hw > 63)
729		set_long(pc, e);
730	e->inst[0] |= (src->hw << 9);
731}
732
733static void
734set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
735{
736	if (src->type == P_ATTR) {
737		struct nv50_reg *temp = temp_temp(pc);
738
739		emit_mov(pc, temp, src);
740		src = temp;
741	} else
742	if (src->type == P_CONST || src->type == P_IMMD) {
743		assert(!(e->inst[0] & 0x00800000));
744		if (e->inst[0] & 0x01000000) {
745			struct nv50_reg *temp = temp_temp(pc);
746
747			emit_mov(pc, temp, src);
748			src = temp;
749		} else {
750			set_data(pc, src, 0x7f, 16, e);
751			e->inst[0] |= 0x00800000;
752		}
753	}
754
755	alloc_reg(pc, src);
756	if (src->hw > 63)
757		set_long(pc, e);
758	e->inst[0] |= ((src->hw & 127) << 16);
759}
760
761static void
762set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
763{
764	set_long(pc, e);
765
766	if (src->type == P_ATTR) {
767		struct nv50_reg *temp = temp_temp(pc);
768
769		emit_mov(pc, temp, src);
770		src = temp;
771	} else
772	if (src->type == P_CONST || src->type == P_IMMD) {
773		assert(!(e->inst[0] & 0x01000000));
774		if (e->inst[0] & 0x00800000) {
775			struct nv50_reg *temp = temp_temp(pc);
776
777			emit_mov(pc, temp, src);
778			src = temp;
779		} else {
780			set_data(pc, src, 0x7f, 32+14, e);
781			e->inst[0] |= 0x01000000;
782		}
783	}
784
785	alloc_reg(pc, src);
786	e->inst[1] |= ((src->hw & 127) << 14);
787}
788
789static void
790emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
791	 struct nv50_reg *src1)
792{
793	struct nv50_program_exec *e = exec(pc);
794
795	e->inst[0] |= 0xc0000000;
796
797	if (!pc->allow32)
798		set_long(pc, e);
799
800	check_swap_src_0_1(pc, &src0, &src1);
801	set_dst(pc, dst, e);
802	set_src_0(pc, src0, e);
803	if (src1->type == P_IMMD && !is_long(e)) {
804		if (src0->neg)
805			e->inst[0] |= 0x00008000;
806		set_immd(pc, src1, e);
807	} else {
808		set_src_1(pc, src1, e);
809		if (src0->neg ^ src1->neg) {
810			if (is_long(e))
811				e->inst[1] |= 0x08000000;
812			else
813				e->inst[0] |= 0x00008000;
814		}
815	}
816
817	emit(pc, e);
818}
819
820static void
821emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
822	 struct nv50_reg *src0, struct nv50_reg *src1)
823{
824	struct nv50_program_exec *e = exec(pc);
825
826	e->inst[0] = 0xb0000000;
827
828	alloc_reg(pc, src1);
829	check_swap_src_0_1(pc, &src0, &src1);
830
831	if (!pc->allow32 || (src0->neg | src1->neg) || src1->hw > 63) {
832		set_long(pc, e);
833		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
834	}
835
836	set_dst(pc, dst, e);
837	set_src_0(pc, src0, e);
838	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
839		set_src_2(pc, src1, e);
840	else
841	if (src1->type == P_IMMD)
842		set_immd(pc, src1, e);
843	else
844		set_src_1(pc, src1, e);
845
846	emit(pc, e);
847}
848
849static void
850emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
851	 uint8_t s)
852{
853	struct nv50_program_exec *e = exec(pc);
854
855	set_long(pc, e);
856	e->inst[1] |= 0xc0000000;
857
858	e->inst[0] |= dst->hw << 2;
859	e->inst[0] |= s << 16; /* shift left */
860	set_src_0_restricted(pc, src, e);
861
862	emit(pc, e);
863}
864
865static void
866emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
867	    struct nv50_reg *src0, struct nv50_reg *src1)
868{
869	struct nv50_program_exec *e = exec(pc);
870
871	set_long(pc, e);
872	e->inst[0] |= 0xb0000000;
873	e->inst[1] |= (sub << 29);
874
875	check_swap_src_0_1(pc, &src0, &src1);
876	set_dst(pc, dst, e);
877	set_src_0(pc, src0, e);
878	set_src_1(pc, src1, e);
879
880	emit(pc, e);
881}
882
883static INLINE void
884emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
885	 struct nv50_reg *src1)
886{
887	assert(src0 != src1);
888	src1->neg ^= 1;
889	emit_add(pc, dst, src0, src1);
890	src1->neg ^= 1;
891}
892
893static void
894emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
895	    struct nv50_reg *src1, unsigned op)
896{
897	struct nv50_program_exec *e = exec(pc);
898
899	e->inst[0] = 0xd0000000;
900	set_long(pc, e);
901
902	check_swap_src_0_1(pc, &src0, &src1);
903	set_dst(pc, dst, e);
904	set_src_0(pc, src0, e);
905
906	if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR &&
907	    op != TGSI_OPCODE_XOR)
908		assert(!"invalid bit op");
909
910	if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
911		set_immd(pc, src1, e);
912		if (op == TGSI_OPCODE_OR)
913			e->inst[0] |= 0x0100;
914		else
915		if (op == TGSI_OPCODE_XOR)
916			e->inst[0] |= 0x8000;
917	} else {
918		set_src_1(pc, src1, e);
919		e->inst[1] |= 0x04000000; /* 32 bit */
920		if (op == TGSI_OPCODE_OR)
921			e->inst[1] |= 0x4000;
922		else
923		if (op == TGSI_OPCODE_XOR)
924			e->inst[1] |= 0x8000;
925	}
926
927	emit(pc, e);
928}
929
930static void
931emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
932	 struct nv50_reg *src1, struct nv50_reg *src2)
933{
934	struct nv50_program_exec *e = exec(pc);
935
936	e->inst[0] |= 0xe0000000;
937
938	check_swap_src_0_1(pc, &src0, &src1);
939	set_dst(pc, dst, e);
940	set_src_0(pc, src0, e);
941	set_src_1(pc, src1, e);
942	set_src_2(pc, src2, e);
943
944	if (src0->neg ^ src1->neg)
945		e->inst[1] |= 0x04000000;
946	if (src2->neg)
947		e->inst[1] |= 0x08000000;
948
949	emit(pc, e);
950}
951
952static INLINE void
953emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
954	 struct nv50_reg *src1, struct nv50_reg *src2)
955{
956	assert(src2 != src0 && src2 != src1);
957	src2->neg ^= 1;
958	emit_mad(pc, dst, src0, src1, src2);
959	src2->neg ^= 1;
960}
961
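/* Scalar "flop" (0x90000000) with subop 'sub'; judging from the opcode
 * handlers below: 0 = RCP, 2 = RSQ, 3 = LG2, 4 = SIN, 5 = COS, 6 = EX2.
 * Subops 0 and 2 only accept a plain temp as source.
 */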
962static void
963emit_flop(struct nv50_pc *pc, unsigned sub,
964	  struct nv50_reg *dst, struct nv50_reg *src)
965{
966	struct nv50_program_exec *e = exec(pc);
967
968	e->inst[0] |= 0x90000000;
969	if (sub) {
970		set_long(pc, e);
971		e->inst[1] |= (sub << 29);
972	}
973
974	set_dst(pc, dst, e);
975
976	if (sub == 0 || sub == 2)
977		set_src_0_restricted(pc, src, e);
978	else
979		set_src_0(pc, src, e);
980
981	emit(pc, e);
982}
983
984static void
985emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
986{
987	struct nv50_program_exec *e = exec(pc);
988
989	e->inst[0] |= 0xb0000000;
990
991	set_dst(pc, dst, e);
992	set_src_0(pc, src, e);
993	set_long(pc, e);
994	e->inst[1] |= (6 << 29) | 0x00004000;
995
996	emit(pc, e);
997}
998
999static void
1000emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1001{
1002	struct nv50_program_exec *e = exec(pc);
1003
1004	e->inst[0] |= 0xb0000000;
1005
1006	set_dst(pc, dst, e);
1007	set_src_0(pc, src, e);
1008	set_long(pc, e);
1009	e->inst[1] |= (6 << 29);
1010
1011	emit(pc, e);
1012}
1013
1014#define CVTOP_RN	0x01
1015#define CVTOP_FLOOR	0x03
1016#define CVTOP_CEIL	0x05
1017#define CVTOP_TRUNC	0x07
1018#define CVTOP_SAT	0x08
1019#define CVTOP_ABS	0x10
1020
1021/* 0x04 == 32 bit dst */
1022/* 0x40 == dst is float */
1023/* 0x80 == src is float */
1024#define CVT_F32_F32 0xc4
1025#define CVT_F32_S32 0x44
1026#define CVT_S32_F32 0x8c
1027#define CVT_S32_S32 0x0c
1028#define CVT_NEG     0x20
1029#define CVT_RI      0x08
1030
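/* Emit a CVT: cvn selects the conversion op (CVTOP_*), fmt the source and
 * destination formats (CVT_*). With wp >= 0 the condition is also written
 * to predicate register wp; passing dst == NULL writes only the predicate
 * (the 0x1fc destination encoding apparently acts as a bit bucket).
 */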
1031static void
1032emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
1033	 int wp, unsigned cvn, unsigned fmt)
1034{
1035	struct nv50_program_exec *e;
1036
1037	e = exec(pc);
1038	set_long(pc, e);
1039
1040	e->inst[0] |= 0xa0000000;
1041	e->inst[1] |= 0x00004000; /* 32 bit src */
1042	e->inst[1] |= (cvn << 16);
1043	e->inst[1] |= (fmt << 24);
1044	set_src_0(pc, src, e);
1045
1046	if (wp >= 0)
1047		set_pred_wr(pc, 1, wp, e);
1048
1049	if (dst)
1050		set_dst(pc, dst, e);
1051	else {
1052		e->inst[0] |= 0x000001fc;
1053		e->inst[1] |= 0x00000008;
1054	}
1055
1056	emit(pc, e);
1057}
1058
1059/* nv50 Condition codes:
1060 *  0x1 = LT
1061 *  0x2 = EQ
1062 *  0x3 = LE
1063 *  0x4 = GT
1064 *  0x5 = NE
1065 *  0x6 = GE
1066 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
1067 *  0x8 = unordered bit (allows NaN)
1068 */
1069static void
1070emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
1071	 struct nv50_reg *src0, struct nv50_reg *src1)
1072{
1073	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
1074
1075	struct nv50_program_exec *e = exec(pc);
1076	struct nv50_reg *rdst;
1077
1078	assert(ccode < 16);
1079	if (check_swap_src_0_1(pc, &src0, &src1))
1080		ccode = cc_swapped[ccode & 7] | (ccode & 8);
1081
1082	rdst = dst;
1083	if (dst && dst->type != P_TEMP)
1084		dst = alloc_temp(pc, NULL);
1085
1086	/* set.u32 */
1087	set_long(pc, e);
1088	e->inst[0] |= 0xb0000000;
1089	e->inst[1] |= 0x60000000 | (ccode << 14);
1090
1091	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
1092	 * that doesn't seem to match what the hw actually does
1093	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
1094	 */
1095
1096	if (wp >= 0)
1097		set_pred_wr(pc, 1, wp, e);
1098	if (dst)
1099		set_dst(pc, dst, e);
1100	else {
1101		e->inst[0] |= 0x000001fc;
1102		e->inst[1] |= 0x00000008;
1103	}
1104
1105	set_src_0(pc, src0, e);
1106	set_src_1(pc, src1, e);
1107
1108	emit(pc, e);
1109	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */
1110
1111	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
1112	if (rdst)
1113		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
1114	if (rdst && rdst != dst)
1115		free_temp(pc, dst);
1116}
1117
1118static INLINE unsigned
1119map_tgsi_setop_cc(unsigned op)
1120{
1121	switch (op) {
1122	case TGSI_OPCODE_SLT: return 0x1;
1123	case TGSI_OPCODE_SGE: return 0x6;
1124	case TGSI_OPCODE_SEQ: return 0x2;
1125	case TGSI_OPCODE_SGT: return 0x4;
1126	case TGSI_OPCODE_SLE: return 0x3;
1127	case TGSI_OPCODE_SNE: return 0xd;
1128	default:
1129		assert(0);
1130		return 0;
1131	}
1132}
1133
1134static INLINE void
1135emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1136{
1137	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
1138}
1139
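/* dst = v^e, computed as 2^(e * log2(v)): LG2, MUL, pre-EX2, EX2. */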
1140static void
1141emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
1142	 struct nv50_reg *v, struct nv50_reg *e)
1143{
1144	struct nv50_reg *temp = alloc_temp(pc, NULL);
1145
1146	emit_flop(pc, 3, temp, v);
1147	emit_mul(pc, temp, temp, e);
1148	emit_preex2(pc, temp, temp);
1149	emit_flop(pc, 6, dst, temp);
1150
1151	free_temp(pc, temp);
1152}
1153
1154static INLINE void
1155emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1156{
1157	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
1158}
1159
1160static INLINE void
1161emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1162{
1163	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
1164}
1165
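/* LIT: dst.x = dst.w = 1, dst.y = max(src.x, 0), and dst.z becomes
 * max(src.y, 0)^clamp(src.w, [-128, 128]) when src.x > 0, else 0 (the
 * usual TGSI/ARB definition); a predicate written by the first MAX selects
 * between the POW result and zero.
 */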
1166static void
1167emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1168	 struct nv50_reg **src)
1169{
1170	struct nv50_reg *one = alloc_immd(pc, 1.0);
1171	struct nv50_reg *zero = alloc_immd(pc, 0.0);
1172	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
1173	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
1174	struct nv50_reg *tmp[4];
1175	boolean allow32 = pc->allow32;
1176
1177	pc->allow32 = FALSE;
1178
1179	if (mask & (3 << 1)) {
1180		tmp[0] = alloc_temp(pc, NULL);
1181		emit_minmax(pc, 4, tmp[0], src[0], zero);
1182	}
1183
1184	if (mask & (1 << 2)) {
1185		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
1186
1187		tmp[1] = temp_temp(pc);
1188		emit_minmax(pc, 4, tmp[1], src[1], zero);
1189
1190		tmp[3] = temp_temp(pc);
1191		emit_minmax(pc, 4, tmp[3], src[3], neg128);
1192		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
1193
1194		emit_pow(pc, dst[2], tmp[1], tmp[3]);
1195		emit_mov(pc, dst[2], zero);
1196		set_pred(pc, 3, 0, pc->p->exec_tail);
1197	}
1198
1199	if (mask & (1 << 1))
1200		assimilate_temp(pc, dst[1], tmp[0]);
1201	else
1202	if (mask & (1 << 2))
1203		free_temp(pc, tmp[0]);
1204
1205	pc->allow32 = allow32;
1206
1207	/* do this last, in case src[i,j] == dst[0,3] */
1208	if (mask & (1 << 0))
1209		emit_mov(pc, dst[0], one);
1210
1211	if (mask & (1 << 3))
1212		emit_mov(pc, dst[3], one);
1213
1214	FREE(pos128);
1215	FREE(neg128);
1216	FREE(zero);
1217	FREE(one);
1218}
1219
1220static INLINE void
1221emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1222{
1223	emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
1224}
1225
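/* KIL: convert the source so its sign lands in predicate reg 1, then emit
 * a discard (opcode 0x00000002) predicated on "less than zero".
 */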
1226static void
1227emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
1228{
1229	struct nv50_program_exec *e;
1230	const int r_pred = 1;
1231	unsigned cvn = CVT_F32_F32;
1232
1233	if (src->neg)
1234		cvn |= CVT_NEG;
1235	/* write predicate reg */
1236	emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);
1237
1238	/* conditional discard */
1239	e = exec(pc);
1240	e->inst[0] = 0x00000002;
1241	set_long(pc, e);
1242	set_pred(pc, 0x1 /* LT */, r_pred, e);
1243	emit(pc, e);
1244}
1245
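/* TEX/TXP: copy the (projected or perspective-corrected) coordinates into
 * a 4-aligned block of temps, emit the texture fetch (0xf0000000) with the
 * component mask, then move the fetched values to the real destinations.
 */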
1246static void
1247emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1248	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
1249{
1250	struct nv50_reg *temp, *t[4];
1251	struct nv50_program_exec *e;
1252
1253	unsigned c, mode, dim;
1254
1255	switch (type) {
1256	case TGSI_TEXTURE_1D:
1257		dim = 1;
1258		break;
1259	case TGSI_TEXTURE_UNKNOWN:
1260	case TGSI_TEXTURE_2D:
1261	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1262	case TGSI_TEXTURE_RECT:
1263		dim = 2;
1264		break;
1265	case TGSI_TEXTURE_3D:
1266	case TGSI_TEXTURE_CUBE:
1267	case TGSI_TEXTURE_SHADOW2D:
1268	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1269		dim = 3;
1270		break;
1271	default:
1272		assert(0);
1273		break;
1274	}
1275
1276	/* some cards need t[0]'s hw index to be a multiple of 4 */
1277	alloc_temp4(pc, t, 0);
1278
1279	if (proj) {
1280		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1281			mode = pc->interp_mode[src[0]->index];
1282
1283			t[3]->rhw = src[3]->rhw;
1284			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1285			emit_flop(pc, 0, t[3], t[3]);
1286
1287			for (c = 0; c < dim; c++) {
1288				t[c]->rhw = src[c]->rhw;
1289				emit_interp(pc, t[c], t[3],
1290					    (mode | INTERP_PERSPECTIVE));
1291			}
1292		} else {
1293			emit_flop(pc, 0, t[3], src[3]);
1294			for (c = 0; c < dim; c++)
1295				emit_mul(pc, t[c], src[c], t[3]);
1296
1297			/* XXX: for some reason the blob sometimes uses MAD:
1298			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1299			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1300			 */
1301		}
1302	} else {
1303		if (type == TGSI_TEXTURE_CUBE) {
1304			temp = temp_temp(pc);
1305			emit_minmax(pc, 4, temp, src[0], src[1]);
1306			emit_minmax(pc, 4, temp, temp, src[2]);
1307			emit_flop(pc, 0, temp, temp);
1308			for (c = 0; c < 3; c++)
1309				emit_mul(pc, t[c], src[c], temp);
1310		} else {
1311			for (c = 0; c < dim; c++)
1312				emit_mov(pc, t[c], src[c]);
1313		}
1314	}
1315
1316	e = exec(pc);
1317	set_long(pc, e);
1318	e->inst[0] |= 0xf0000000;
1319	e->inst[1] |= 0x00000004;
1320	set_dst(pc, t[0], e);
1321	e->inst[0] |= (unit << 9);
1322
1323	if (dim == 2)
1324		e->inst[0] |= 0x00400000;
1325	else
1326	if (dim == 3)
1327		e->inst[0] |= 0x00800000;
1328
1329	e->inst[0] |= (mask & 0x3) << 25;
1330	e->inst[1] |= (mask & 0xc) << 12;
1331
1332	emit(pc, e);
1333
1334#if 1
1335	c = 0;
1336	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
1337	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
1338	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
1339	if (mask & 8) emit_mov(pc, dst[3], t[c]);
1340
1341	free_temp4(pc, t);
1342#else
1343	/* XXX: if e.g. MUL is used directly after TEX, it would still use
1344	 * the texture coordinates, not the fetched values: latency ? */
1345
1346	for (c = 0; c < 4; c++) {
1347		if (mask & (1 << c))
1348			assimilate_temp(pc, dst[c], t[c]);
1349		else
1350			free_temp(pc, t[c]);
1351	}
1352#endif
1353}
1354
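/* Emit a BRA (0x10000002), optionally preceded by a JOIN_AT (0xa0000002)
 * returned through *join; the branch target (param.index) is filled in
 * later by the IF/ELSE/ENDIF and loop handlers.
 */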
1355static void
1356emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
1357	    struct nv50_program_exec **join)
1358{
1359	struct nv50_program_exec *e = exec(pc);
1360
1361	if (join) {
1362		set_long(pc, e);
1363		e->inst[0] |= 0xa0000002;
1364		emit(pc, e);
1365		*join = e;
1366		e = exec(pc);
1367	}
1368
1369	set_long(pc, e);
1370	e->inst[0] |= 0x10000002;
1371	if (pred >= 0)
1372		set_pred(pc, cc, pred, e);
1373	emit(pc, e);
1374}
1375
1376static void
1377emit_nop(struct nv50_pc *pc)
1378{
1379	struct nv50_program_exec *e = exec(pc);
1380
1381	e->inst[0] = 0xf0000000;
1382	set_long(pc, e);
1383	e->inst[1] = 0xe0000000;
1384	emit(pc, e);
1385}
1386
1387static void
1388emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1389{
1390	struct nv50_program_exec *e = exec(pc);
1391
1392	assert(src->type == P_TEMP);
1393
1394	e->inst[0] = 0xc0140000;
1395	e->inst[1] = 0x89800000;
1396	set_long(pc, e);
1397	set_dst(pc, dst, e);
1398	set_src_0(pc, src, e);
1399	set_src_2(pc, src, e);
1400
1401	emit(pc, e);
1402}
1403
1404static void
1405emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1406{
1407	struct nv50_program_exec *e = exec(pc);
1408
1409	assert(src->type == P_TEMP);
1410
1411	if (!src->neg) /* don't negate twice */
1412		emit_neg(pc, src, src);
1413
1414	e->inst[0] = 0xc0150000;
1415	e->inst[1] = 0x8a400000;
1416	set_long(pc, e);
1417	set_dst(pc, dst, e);
1418	set_src_0(pc, src, e);
1419	set_src_2(pc, src, e);
1420
1421	emit(pc, e);
1422}
1423
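/* Convert a short (32-bit) instruction to its long (64-bit) encoding,
 * relocating the operand/mode bits that sit at different positions in the
 * two forms and accounting for the extra word in exec_size.
 */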
1424static void
1425convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1426{
1427	unsigned q = 0, m = ~0;
1428
1429	assert(!is_long(e));
1430
1431	switch (e->inst[0] >> 28) {
1432	case 0x1:
1433		/* MOV */
1434		q = 0x0403c000;
1435		m = 0xffff7fff;
1436		break;
1437	case 0x8:
1438		/* INTERP (move centroid, perspective and flat bits) */
1439		m = ~0x03000100;
1440		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1441		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1442		break;
1443	case 0x9:
1444		/* RCP */
1445		break;
1446	case 0xB:
1447		/* ADD */
1448		m = ~(127 << 16);
1449		q = ((e->inst[0] & (~m)) >> 2);
1450		break;
1451	case 0xC:
1452		/* MUL */
1453		m = ~0x00008000;
1454		q = ((e->inst[0] & (~m)) << 12);
1455		break;
1456	case 0xE:
1457		/* MAD (if src2 == dst) */
1458		q = ((e->inst[0] & 0x1fc) << 12);
1459		break;
1460	default:
1461		assert(0);
1462		break;
1463	}
1464
1465	set_long(pc, e);
1466	pc->p->exec_size++;
1467
1468	e->inst[0] &= m;
1469	e->inst[1] |= q;
1470}
1471
1472/* Some operations support an optional negation flag. */
1473static boolean
1474negate_supported(const struct tgsi_full_instruction *insn, int i)
1475{
1476	int s;
1477
1478	switch (insn->Instruction.Opcode) {
1479	case TGSI_OPCODE_DDY:
1480	case TGSI_OPCODE_DP3:
1481	case TGSI_OPCODE_DP4:
1482	case TGSI_OPCODE_MUL:
1483	case TGSI_OPCODE_KIL:
1484	case TGSI_OPCODE_ADD:
1485	case TGSI_OPCODE_SUB:
1486	case TGSI_OPCODE_MAD:
1487		break;
1488	case TGSI_OPCODE_POW:
1489		if (i == 1)
1490			break;
1491		return FALSE;
1492	default:
1493		return FALSE;
1494	}
1495
1496	/* Watch out for possible multiple uses of an nv50_reg, we
1497	 * can't use nv50_reg::neg in these cases.
1498	 */
1499	for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) {
1500		if (s == i)
1501			continue;
1502		if ((insn->FullSrcRegisters[s].SrcRegister.Index ==
1503		     insn->FullSrcRegisters[i].SrcRegister.Index) &&
1504		    (insn->FullSrcRegisters[s].SrcRegister.File ==
1505		     insn->FullSrcRegisters[i].SrcRegister.File))
1506			return FALSE;
1507	}
1508
1509	return TRUE;
1510}
1511
1512/* Return a read mask for source registers deduced from opcode & write mask. */
1513static unsigned
1514nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
1515{
1516	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
1517
1518	switch (insn->Instruction.Opcode) {
1519	case TGSI_OPCODE_COS:
1520	case TGSI_OPCODE_SIN:
1521		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
1522	case TGSI_OPCODE_DP3:
1523		return 0x7;
1524	case TGSI_OPCODE_DP4:
1525	case TGSI_OPCODE_DPH:
1526	case TGSI_OPCODE_KIL: /* WriteMask ignored */
1527		return 0xf;
1528	case TGSI_OPCODE_DST:
1529		return mask & (c ? 0xa : 0x6);
1530	case TGSI_OPCODE_EX2:
1531	case TGSI_OPCODE_LG2:
1532	case TGSI_OPCODE_POW:
1533	case TGSI_OPCODE_RCP:
1534	case TGSI_OPCODE_RSQ:
1535	case TGSI_OPCODE_SCS:
1536		return 0x1;
1537	case TGSI_OPCODE_LIT:
1538		return 0xb;
1539	case TGSI_OPCODE_TEX:
1540	case TGSI_OPCODE_TXP:
1541	{
1542		const struct tgsi_instruction_ext_texture *tex;
1543
1544		assert(insn->Instruction.Extended);
1545		tex = &insn->InstructionExtTexture;
1546
1547		mask = 0x7;
1548		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1549			mask |= 0x8;
1550
1551		switch (tex->Texture) {
1552		case TGSI_TEXTURE_1D:
1553			mask &= 0x9;
1554			break;
1555		case TGSI_TEXTURE_2D:
1556			mask &= 0xb;
1557			break;
1558		default:
1559			break;
1560		}
1561	}
1562		return mask;
1563	case TGSI_OPCODE_XPD:
1564		x = 0;
1565		if (mask & 1) x |= 0x6;
1566		if (mask & 2) x |= 0x5;
1567		if (mask & 4) x |= 0x3;
1568		return x;
1569	default:
1570		break;
1571	}
1572
1573	return mask;
1574}
1575
1576static struct nv50_reg *
1577tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1578{
1579	switch (dst->DstRegister.File) {
1580	case TGSI_FILE_TEMPORARY:
1581		return &pc->temp[dst->DstRegister.Index * 4 + c];
1582	case TGSI_FILE_OUTPUT:
1583		return &pc->result[dst->DstRegister.Index * 4 + c];
1584	case TGSI_FILE_ADDRESS:
1585	{
1586		struct nv50_reg *r = pc->addr[dst->DstRegister.Index * 4 + c];
1587		if (!r) {
1588			r = alloc_addr(pc, NULL);
1589			pc->addr[dst->DstRegister.Index * 4 + c] = r;
1590		}
1591		assert(r);
1592		return r;
1593	}
1594	case TGSI_FILE_NULL:
1595		return NULL;
1596	default:
1597		break;
1598	}
1599
1600	return NULL;
1601}
1602
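/* Resolve one TGSI source (file, index, swizzled component) to an nv50_reg
 * and apply its sign mode; absolute/explicitly negated values go through a
 * scratch temp, and indirectly addressed constants get a fresh P_CONST reg
 * with acc < 0 (freed again at the end of nv50_program_tx_insn).
 */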
1603static struct nv50_reg *
1604tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1605	 boolean neg)
1606{
1607	struct nv50_reg *r = NULL;
1608	struct nv50_reg *temp;
1609	unsigned sgn, c, swz;
1610
1611	if (src->SrcRegister.File != TGSI_FILE_CONSTANT)
1612		assert(!src->SrcRegister.Indirect);
1613
1614	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1615
1616	c = tgsi_util_get_full_src_register_swizzle(src, chan);
1617	switch (c) {
1618	case TGSI_SWIZZLE_X:
1619	case TGSI_SWIZZLE_Y:
1620	case TGSI_SWIZZLE_Z:
1621	case TGSI_SWIZZLE_W:
1622		switch (src->SrcRegister.File) {
1623		case TGSI_FILE_INPUT:
1624			r = &pc->attr[src->SrcRegister.Index * 4 + c];
1625			break;
1626		case TGSI_FILE_TEMPORARY:
1627			r = &pc->temp[src->SrcRegister.Index * 4 + c];
1628			break;
1629		case TGSI_FILE_CONSTANT:
1630			if (!src->SrcRegister.Indirect) {
1631				r = &pc->param[src->SrcRegister.Index * 4 + c];
1632				break;
1633			}
1634			/* Indicate indirection by setting r->acc < 0 and
1635			 * use the index field to select the address reg.
1636			 */
1637			r = MALLOC_STRUCT(nv50_reg);
1638			swz = tgsi_util_get_src_register_swizzle(
1639						 &src->SrcRegisterInd, 0);
1640			ctor_reg(r, P_CONST,
1641				 src->SrcRegisterInd.Index * 4 + swz,
1642				 src->SrcRegister.Index * 4 + c);
1643			r->acc = -1;
1644			break;
1645		case TGSI_FILE_IMMEDIATE:
1646			r = &pc->immd[src->SrcRegister.Index * 4 + c];
1647			break;
1648		case TGSI_FILE_SAMPLER:
1649			break;
1650		case TGSI_FILE_ADDRESS:
1651			r = pc->addr[src->SrcRegister.Index * 4 + c];
1652			assert(r);
1653			break;
1654		default:
1655			assert(0);
1656			break;
1657		}
1658		break;
1659	default:
1660		assert(0);
1661		break;
1662	}
1663
1664	switch (sgn) {
1665	case TGSI_UTIL_SIGN_KEEP:
1666		break;
1667	case TGSI_UTIL_SIGN_CLEAR:
1668		temp = temp_temp(pc);
1669		emit_abs(pc, temp, r);
1670		r = temp;
1671		break;
1672	case TGSI_UTIL_SIGN_TOGGLE:
1673		if (neg)
1674			r->neg = 1;
1675		else {
1676			temp = temp_temp(pc);
1677			emit_neg(pc, temp, r);
1678			r = temp;
1679		}
1680		break;
1681	case TGSI_UTIL_SIGN_SET:
1682		temp = temp_temp(pc);
1683		emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | CVT_NEG);
1684		r = temp;
1685		break;
1686	default:
1687		assert(0);
1688		break;
1689	}
1690
1691	return r;
1692}
1693
1694/* return TRUE for ops that produce only a single result */
1695static boolean
1696is_scalar_op(unsigned op)
1697{
1698	switch (op) {
1699	case TGSI_OPCODE_COS:
1700	case TGSI_OPCODE_DP2:
1701	case TGSI_OPCODE_DP3:
1702	case TGSI_OPCODE_DP4:
1703	case TGSI_OPCODE_DPH:
1704	case TGSI_OPCODE_EX2:
1705	case TGSI_OPCODE_LG2:
1706	case TGSI_OPCODE_POW:
1707	case TGSI_OPCODE_RCP:
1708	case TGSI_OPCODE_RSQ:
1709	case TGSI_OPCODE_SIN:
1710		/*
1711	case TGSI_OPCODE_KIL:
1712	case TGSI_OPCODE_LIT:
1713	case TGSI_OPCODE_SCS:
1714		*/
1715		return TRUE;
1716	default:
1717		return FALSE;
1718	}
1719}
1720
1721/* Returns a bitmask indicating which dst components depend
1722 * on source s, component c (reverse of nv50_tgsi_src_mask).
1723 */
1724static unsigned
1725nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1726{
1727	if (is_scalar_op(op))
1728		return 0x1;
1729
1730	switch (op) {
1731	case TGSI_OPCODE_DST:
1732		return (1 << c) & (s ? 0xa : 0x6);
1733	case TGSI_OPCODE_XPD:
1734		switch (c) {
1735		case 0: return 0x6;
1736		case 1: return 0x5;
1737		case 2: return 0x3;
1738		case 3: return 0x0;
1739		default:
1740			assert(0);
1741			return 0x0;
1742		}
1743	case TGSI_OPCODE_LIT:
1744	case TGSI_OPCODE_SCS:
1745	case TGSI_OPCODE_TEX:
1746	case TGSI_OPCODE_TXP:
1747		/* these take care of dangerous swizzles themselves */
1748		return 0x0;
1749	case TGSI_OPCODE_IF:
1750	case TGSI_OPCODE_KIL:
1751		/* don't call this function for these ops */
1752		assert(0);
1753		return 0;
1754	default:
1755		/* linear vector instruction */
1756		return (1 << c);
1757	}
1758}
1759
1760static INLINE boolean
1761has_pred(struct nv50_program_exec *e, unsigned cc)
1762{
1763	if (!is_long(e) || is_immd(e))
1764		return FALSE;
1765	return ((e->inst[1] & 0x780) == (cc << 7));
1766}
1767
1768/* on ENDIF see if we can do "@p0.neu single_op" instead of:
1769 *        join_at ENDIF
1770 *        @p0.eq bra ENDIF
1771 *        single_op
1772 * ENDIF: nop.join
1773 */
1774static boolean
1775nv50_kill_branch(struct nv50_pc *pc)
1776{
1777	int lvl = pc->if_lvl;
1778
1779	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
1780		return FALSE;
1781
1782	/* if ccode == 'true', the BRA is from an ELSE and the predicate
1783	 * reg may no longer be valid, since we currently always use $p0
1784	 */
1785	if (has_pred(pc->if_insn[lvl], 0xf))
1786		return FALSE;
1787	assert(pc->if_insn[lvl] && pc->br_join[lvl]);
1788
1789	/* We'll use the exec allocated for JOIN_AT (as we can't easily
1790	 * update prev's next); if exec_tail is BRK, update the pointer.
1791	 */
1792	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
1793		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];
1794
1795	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
1796
1797	*pc->br_join[lvl] = *pc->p->exec_tail;
1798
1799	FREE(pc->if_insn[lvl]);
1800	FREE(pc->p->exec_tail);
1801
1802	pc->p->exec_tail = pc->br_join[lvl];
1803	pc->p->exec_tail->next = NULL;
1804	set_pred(pc, 0xd, 0, pc->p->exec_tail);
1805
1806	return TRUE;
1807}
1808
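/* Translate a single TGSI instruction: gather dst/src nv50_regs (honoring
 * the write mask, saturation and the r_dst/r_brdc overrides), dispatch on
 * the opcode, then broadcast scalar results and saturate where required
 * before releasing per-instruction temporaries.
 */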
1809static boolean
1810nv50_program_tx_insn(struct nv50_pc *pc,
1811		     const struct tgsi_full_instruction *inst)
1812{
1813	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
1814	unsigned mask, sat, unit;
1815	int i, c;
1816
1817	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1818	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1819
1820	memset(src, 0, sizeof(src));
1821
1822	for (c = 0; c < 4; c++) {
1823		if ((mask & (1 << c)) && !pc->r_dst[c])
1824			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1825		else
1826			dst[c] = pc->r_dst[c];
1827		rdst[c] = dst[c];
1828	}
1829
1830	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1831		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1832		unsigned src_mask;
1833		boolean neg_supp;
1834
1835		src_mask = nv50_tgsi_src_mask(inst, i);
1836		neg_supp = negate_supported(inst, i);
1837
1838		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1839			unit = fs->SrcRegister.Index;
1840
1841		for (c = 0; c < 4; c++)
1842			if (src_mask & (1 << c))
1843				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
1844	}
1845
1846	brdc = temp = pc->r_brdc;
1847	if (brdc && brdc->type != P_TEMP) {
1848		temp = temp_temp(pc);
1849		if (sat)
1850			brdc = temp;
1851	} else
1852	if (sat) {
1853		for (c = 0; c < 4; c++) {
1854			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
1855				continue;
1856			/* rdst[c] = dst[c]; */ /* done above */
1857			dst[c] = temp_temp(pc);
1858		}
1859	}
1860
1861	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
1862
1863	switch (inst->Instruction.Opcode) {
1864	case TGSI_OPCODE_ABS:
1865		for (c = 0; c < 4; c++) {
1866			if (!(mask & (1 << c)))
1867				continue;
1868			emit_abs(pc, dst[c], src[0][c]);
1869		}
1870		break;
1871	case TGSI_OPCODE_ADD:
1872		for (c = 0; c < 4; c++) {
1873			if (!(mask & (1 << c)))
1874				continue;
1875			emit_add(pc, dst[c], src[0][c], src[1][c]);
1876		}
1877		break;
1878	case TGSI_OPCODE_AND:
1879	case TGSI_OPCODE_XOR:
1880	case TGSI_OPCODE_OR:
1881		for (c = 0; c < 4; c++) {
1882			if (!(mask & (1 << c)))
1883				continue;
1884			emit_bitop2(pc, dst[c], src[0][c], src[1][c],
1885				    inst->Instruction.Opcode);
1886		}
1887		break;
1888	case TGSI_OPCODE_ARL:
1889		assert(src[0][0]);
1890		temp = temp_temp(pc);
1891		emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32);
1892		emit_arl(pc, dst[0], temp, 4);
1893		break;
1894	case TGSI_OPCODE_BGNLOOP:
1895		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
1896		terminate_mbb(pc);
1897		break;
1898	case TGSI_OPCODE_BRK:
1899		emit_branch(pc, -1, 0, NULL);
1900		assert(pc->loop_lvl > 0);
1901		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
1902		break;
1903	case TGSI_OPCODE_CEIL:
1904		for (c = 0; c < 4; c++) {
1905			if (!(mask & (1 << c)))
1906				continue;
1907			emit_cvt(pc, dst[c], src[0][c], -1,
1908				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
1909		}
1910		break;
1911	case TGSI_OPCODE_CMP:
1912		pc->allow32 = FALSE;
1913		for (c = 0; c < 4; c++) {
1914			if (!(mask & (1 << c)))
1915				continue;
1916			emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
1917			emit_mov(pc, dst[c], src[1][c]);
1918			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
1919			emit_mov(pc, dst[c], src[2][c]);
1920			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
1921		}
1922		break;
1923	case TGSI_OPCODE_COS:
1924		if (mask & 8) {
1925			emit_precossin(pc, temp, src[0][3]);
1926			emit_flop(pc, 5, dst[3], temp);
1927			if (!(mask &= 7))
1928				break;
1929			if (temp == dst[3])
1930				temp = brdc = temp_temp(pc);
1931		}
1932		emit_precossin(pc, temp, src[0][0]);
1933		emit_flop(pc, 5, brdc, temp);
1934		break;
1935	case TGSI_OPCODE_DDX:
1936		for (c = 0; c < 4; c++) {
1937			if (!(mask & (1 << c)))
1938				continue;
1939			emit_ddx(pc, dst[c], src[0][c]);
1940		}
1941		break;
1942	case TGSI_OPCODE_DDY:
1943		for (c = 0; c < 4; c++) {
1944			if (!(mask & (1 << c)))
1945				continue;
1946			emit_ddy(pc, dst[c], src[0][c]);
1947		}
1948		break;
1949	case TGSI_OPCODE_DP3:
1950		emit_mul(pc, temp, src[0][0], src[1][0]);
1951		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1952		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
1953		break;
1954	case TGSI_OPCODE_DP4:
1955		emit_mul(pc, temp, src[0][0], src[1][0]);
1956		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1957		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1958		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
1959		break;
1960	case TGSI_OPCODE_DPH:
1961		emit_mul(pc, temp, src[0][0], src[1][0]);
1962		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1963		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1964		emit_add(pc, brdc, src[1][3], temp);
1965		break;
1966	case TGSI_OPCODE_DST:
1967		if (mask & (1 << 1))
1968			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1969		if (mask & (1 << 2))
1970			emit_mov(pc, dst[2], src[0][2]);
1971		if (mask & (1 << 3))
1972			emit_mov(pc, dst[3], src[1][3]);
1973		if (mask & (1 << 0))
1974			emit_mov_immdval(pc, dst[0], 1.0f);
1975		break;
1976	case TGSI_OPCODE_ELSE:
1977		emit_branch(pc, -1, 0, NULL);
1978		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
1979		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
1980		terminate_mbb(pc);
1981		break;
1982	case TGSI_OPCODE_ENDIF:
1983		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
1984
1985		/* try to replace branch over 1 insn with a predicated insn */
1986		if (nv50_kill_branch(pc) == TRUE)
1987			break;
1988
1989		if (pc->br_join[pc->if_lvl]) {
1990			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
1991			pc->br_join[pc->if_lvl] = NULL;
1992		}
1993		terminate_mbb(pc);
1994		/* emit a NOP as join point, we could set it on the next
1995		 * one, but would have to make sure it is long and !immd
1996		 */
1997		emit_nop(pc);
1998		pc->p->exec_tail->inst[1] |= 2;
1999		break;
2000	case TGSI_OPCODE_ENDLOOP:
2001		emit_branch(pc, -1, 0, NULL);
2002		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
2003		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
2004		terminate_mbb(pc);
2005		break;
2006	case TGSI_OPCODE_EX2:
2007		emit_preex2(pc, temp, src[0][0]);
2008		emit_flop(pc, 6, brdc, temp);
2009		break;
2010	case TGSI_OPCODE_FLR:
2011		for (c = 0; c < 4; c++) {
2012			if (!(mask & (1 << c)))
2013				continue;
2014			emit_flr(pc, dst[c], src[0][c]);
2015		}
2016		break;
2017	case TGSI_OPCODE_FRC:
2018		temp = temp_temp(pc);
2019		for (c = 0; c < 4; c++) {
2020			if (!(mask & (1 << c)))
2021				continue;
2022			emit_flr(pc, temp, src[0][c]);
2023			emit_sub(pc, dst[c], src[0][c], temp);
2024		}
2025		break;
2026	case TGSI_OPCODE_IF:
2027		/* emitting a join_at may not be necessary */
2028		assert(pc->if_lvl < MAX_IF_DEPTH);
2029		/* set_pred_wr(pc, 1, 0, pc->if_cond); */
2030		emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN,
2031			 CVT_F32_F32);
2032		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
2033		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
2034		terminate_mbb(pc);
2035		break;
2036	case TGSI_OPCODE_KIL:
2037		emit_kil(pc, src[0][0]);
2038		emit_kil(pc, src[0][1]);
2039		emit_kil(pc, src[0][2]);
2040		emit_kil(pc, src[0][3]);
2041		break;
2042	case TGSI_OPCODE_LIT:
2043		emit_lit(pc, &dst[0], mask, &src[0][0]);
2044		break;
2045	case TGSI_OPCODE_LG2:
2046		emit_flop(pc, 3, brdc, src[0][0]);
2047		break;
2048	case TGSI_OPCODE_LRP:
2049		temp = temp_temp(pc);
2050		for (c = 0; c < 4; c++) {
2051			if (!(mask & (1 << c)))
2052				continue;
2053			emit_sub(pc, temp, src[1][c], src[2][c]);
2054			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
2055		}
2056		break;
2057	case TGSI_OPCODE_MAD:
2058		for (c = 0; c < 4; c++) {
2059			if (!(mask & (1 << c)))
2060				continue;
2061			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
2062		}
2063		break;
2064	case TGSI_OPCODE_MAX:
2065		for (c = 0; c < 4; c++) {
2066			if (!(mask & (1 << c)))
2067				continue;
2068			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
2069		}
2070		break;
2071	case TGSI_OPCODE_MIN:
2072		for (c = 0; c < 4; c++) {
2073			if (!(mask & (1 << c)))
2074				continue;
2075			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
2076		}
2077		break;
2078	case TGSI_OPCODE_MOV:
2079		for (c = 0; c < 4; c++) {
2080			if (!(mask & (1 << c)))
2081				continue;
2082			emit_mov(pc, dst[c], src[0][c]);
2083		}
2084		break;
2085	case TGSI_OPCODE_MUL:
2086		for (c = 0; c < 4; c++) {
2087			if (!(mask & (1 << c)))
2088				continue;
2089			emit_mul(pc, dst[c], src[0][c], src[1][c]);
2090		}
2091		break;
2092	case TGSI_OPCODE_POW:
2093		emit_pow(pc, brdc, src[0][0], src[1][0]);
2094		break;
2095	case TGSI_OPCODE_RCP:
2096		emit_flop(pc, 0, brdc, src[0][0]);
2097		break;
2098	case TGSI_OPCODE_RSQ:
2099		emit_flop(pc, 2, brdc, src[0][0]);
2100		break;
2101	case TGSI_OPCODE_SCS:
2102		temp = temp_temp(pc);
2103		if (mask & 3)
2104			emit_precossin(pc, temp, src[0][0]);
2105		if (mask & (1 << 0))
2106			emit_flop(pc, 5, dst[0], temp);
2107		if (mask & (1 << 1))
2108			emit_flop(pc, 4, dst[1], temp);
2109		if (mask & (1 << 2))
2110			emit_mov_immdval(pc, dst[2], 0.0);
2111		if (mask & (1 << 3))
2112			emit_mov_immdval(pc, dst[3], 1.0);
2113		break;
2114	case TGSI_OPCODE_SIN:
2115		if (mask & 8) {
2116			emit_precossin(pc, temp, src[0][3]);
2117			emit_flop(pc, 4, dst[3], temp);
2118			if (!(mask &= 7))
2119				break;
2120			if (temp == dst[3])
2121				temp = brdc = temp_temp(pc);
2122		}
2123		emit_precossin(pc, temp, src[0][0]);
2124		emit_flop(pc, 4, brdc, temp);
2125		break;
2126	case TGSI_OPCODE_SLT:
2127	case TGSI_OPCODE_SGE:
2128	case TGSI_OPCODE_SEQ:
2129	case TGSI_OPCODE_SGT:
2130	case TGSI_OPCODE_SLE:
2131	case TGSI_OPCODE_SNE:
2132		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
2133		for (c = 0; c < 4; c++) {
2134			if (!(mask & (1 << c)))
2135				continue;
2136			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
2137		}
2138		break;
2139	case TGSI_OPCODE_SUB:
2140		for (c = 0; c < 4; c++) {
2141			if (!(mask & (1 << c)))
2142				continue;
2143			emit_sub(pc, dst[c], src[0][c], src[1][c]);
2144		}
2145		break;
2146	case TGSI_OPCODE_TEX:
2147		emit_tex(pc, dst, mask, src[0], unit,
2148			 inst->InstructionExtTexture.Texture, FALSE);
2149		break;
2150	case TGSI_OPCODE_TXP:
2151		emit_tex(pc, dst, mask, src[0], unit,
2152			 inst->InstructionExtTexture.Texture, TRUE);
2153		break;
2154	case TGSI_OPCODE_TRUNC:
2155		for (c = 0; c < 4; c++) {
2156			if (!(mask & (1 << c)))
2157				continue;
2158			emit_cvt(pc, dst[c], src[0][c], -1,
2159				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
2160		}
2161		break;
2162	case TGSI_OPCODE_XPD:
2163		temp = temp_temp(pc);
2164		if (mask & (1 << 0)) {
2165			emit_mul(pc, temp, src[0][2], src[1][1]);
2166			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
2167		}
2168		if (mask & (1 << 1)) {
2169			emit_mul(pc, temp, src[0][0], src[1][2]);
2170			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
2171		}
2172		if (mask & (1 << 2)) {
2173			emit_mul(pc, temp, src[0][1], src[1][0]);
2174			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
2175		}
2176		if (mask & (1 << 3))
2177			emit_mov_immdval(pc, dst[3], 1.0);
2178		break;
2179	case TGSI_OPCODE_END:
2180		break;
2181	default:
2182		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
2183		return FALSE;
2184	}
2185
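	/* If the result went to a broadcast register (scalar ops), saturate it
	 * if requested and copy it to every other written component; otherwise
	 * saturation is applied per component below.
	 */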
2186	if (brdc) {
2187		if (sat)
2188			emit_sat(pc, brdc, brdc);
2189		for (c = 0; c < 4; c++)
2190			if ((mask & (1 << c)) && dst[c] != brdc)
2191				emit_mov(pc, dst[c], brdc);
2192	} else
2193	if (sat) {
2194		for (c = 0; c < 4; c++) {
2195			if (!(mask & (1 << c)))
2196				continue;
2197			/* In this case we saturate later, and dst[c] won't
2198			 * be another temp_temp (and thus lost), since rdst
2199			 * already is TEMP (see above). */
2200			if (rdst[c]->type == P_TEMP && rdst[c]->index < 0)
2201				continue;
2202			emit_sat(pc, rdst[c], dst[c]);
2203		}
2204	}
2205
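	/* Release source register structs that were allocated just for this
	 * instruction: inlined immediates carry index == -1, indirectly
	 * addressed constants are marked with acc < 0.
	 */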
2206	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2207		for (c = 0; c < 4; c++) {
2208			if (!src[i][c])
2209				continue;
2210			src[i][c]->neg = 0;
2211			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
2212				FREE(src[i][c]);
2213			else
2214			if (src[i][c]->acc < 0 && src[i][c]->type == P_CONST)
2215				FREE(src[i][c]); /* indirect constant */
2216		}
2217	}
2218
2219	kill_temp_temp(pc);
2220	return TRUE;
2221}
2222
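/* First pass over an instruction: store the current instruction number in
 * the .acc field of every temporary, input and output component it touches,
 * so the later passes can tell which components are actually used.
 */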
2223static void
2224prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
2225{
2226	struct nv50_reg *reg = NULL;
2227	const struct tgsi_full_src_register *src;
2228	const struct tgsi_dst_register *dst;
2229	unsigned i, c, k, mask;
2230
2231	dst = &insn->FullDstRegisters[0].DstRegister;
2232	mask = dst->WriteMask;
2233
2234	if (dst->File == TGSI_FILE_TEMPORARY)
2235		reg = pc->temp;
2236	else
2237	if (dst->File == TGSI_FILE_OUTPUT)
2238		reg = pc->result;
2239
2240	if (reg) {
2241		for (c = 0; c < 4; c++) {
2242			if (!(mask & (1 << c)))
2243				continue;
2244			reg[dst->Index * 4 + c].acc = pc->insn_nr;
2245		}
2246	}
2247
2248	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2249		src = &insn->FullSrcRegisters[i];
2250
2251		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
2252			reg = pc->temp;
2253		else
2254		if (src->SrcRegister.File == TGSI_FILE_INPUT)
2255			reg = pc->attr;
2256		else
2257			continue;
2258
2259		mask = nv50_tgsi_src_mask(insn, i);
2260
2261		for (c = 0; c < 4; c++) {
2262			if (!(mask & (1 << c)))
2263				continue;
2264			k = tgsi_util_get_full_src_register_swizzle(src, c);
2265
2266			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
2267		}
2268	}
2269}
2270
2271/* Returns a bitmask of write positions (not components, see the note at
2272 * the end) whose results must be staged in temporaries first to avoid
2273 * corrupting sources that are still needed.
2274 * m[i]    (out) the component to write in the i-th position
2275 * rdep[c] (in)  bitmask of dst components that require dst[c] as a source
2276 */
2277static unsigned
2278nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
2279{
2280	unsigned i, c, x, unsafe;
2281	unsigned i, c, x, unsafe = 0;
2282	for (c = 0; c < 4; c++)
2283		m[c] = c;
2284
2285	/* Swap as long as a dst component written earlier is depended on
2286	 * by one written later, but the next one isn't depended on by it.
2287	 */
2288	for (c = 0; c < 3; c++) {
2289		if (rdep[m[c + 1]] & (1 << m[c]))
2290			continue; /* if next one is depended on by us */
2291		for (i = c + 1; i < 4; i++)
2292			/* if we are depended on by a later one */
2293			if (rdep[m[c]] & (1 << m[i]))
2294				break;
2295		if (i == 4)
2296			continue;
2297		/* now, swap */
2298		x = m[c];
2299		m[c] = m[c + 1];
2300		m[c + 1] = x;
2301
2302		/* restart */
2303		c = 0;
2304	}
2305
2306	/* mark dependencies that could not be resolved by reordering */
2307	for (i = 0; i < 3; ++i)
2308		for (c = i + 1; c < 4; ++c)
2309			if (rdep[m[i]] & (1 << m[c]))
2310				unsafe |= (1 << i);
2311
2312	/* NOTE: $unsafe is with respect to order, not component */
2313	return unsafe;
2314}
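
/* E.g. for MOV R0.xy, R0.yx each component reads the other, so no order
 * helps: m stays { 0, 1, 2, 3 } and bit 0 of the result is set, telling the
 * caller to stage the first write in a temporary. For MOV R0.xy, R0.xx only
 * y reads x, so m becomes { 1, 0, 2, 3 } (y is written before x) and 0 is
 * returned.
 */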
2315
2316/* Select a suitable dst register for broadcasting scalar results,
2317 * or return NULL if we have to allocate an extra TEMP.
2318 *
2319 * If e.g. only 1 component is written, we may also emit the final
2320 * result to a write-only register.
2321 */
2322static struct nv50_reg *
2323tgsi_broadcast_dst(struct nv50_pc *pc,
2324		   const struct tgsi_full_dst_register *fd, unsigned mask)
2325{
2326	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
2327		int c = ffs(~mask & fd->DstRegister.WriteMask);
2328		if (c)
2329			return tgsi_dst(pc, c - 1, fd);
2330	} else {
2331		int c = ffs(fd->DstRegister.WriteMask) - 1;
2332		if ((1 << c) == fd->DstRegister.WriteMask)
2333			return tgsi_dst(pc, c, fd);
2334	}
2335
2336	return NULL;
2337}
2338
2339/* Scan source swizzles and return a bitmask of dst components that are
2340 * also read as sources, and fill rdep for nv50_revdep_reorder.
2341 */
2342static unsigned
2343nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
2344		       unsigned rdep[4])
2345{
2346	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
2347	const struct tgsi_full_src_register *fs;
2348	unsigned i, deqs = 0;
2349
2350	for (i = 0; i < 4; ++i)
2351		rdep[i] = 0;
2352
2353	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
2354		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
2355		boolean neg_supp = negate_supported(insn, i);
2356
2357		fs = &insn->FullSrcRegisters[i];
2358		if (fs->SrcRegister.File != fd->DstRegister.File ||
2359		    fs->SrcRegister.Index != fd->DstRegister.Index)
2360			continue;
2361
2362		for (chn = 0; chn < 4; ++chn) {
2363			unsigned s, c;
2364
2365			if (!(mask & (1 << chn))) /* src is not read */
2366				continue;
2367			c = tgsi_util_get_full_src_register_swizzle(fs, chn);
2368			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
2369
2370			if (!(fd->DstRegister.WriteMask & (1 << c)))
2371				continue;
2372
2373			/* no danger if src is copied to TEMP first */
2374			if ((s != TGSI_UTIL_SIGN_KEEP) &&
2375			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
2376				continue;
2377
2378			rdep[c] |= nv50_tgsi_dst_revdep(
2379				insn->Instruction.Opcode, i, chn);
2380			deqs |= (1 << c);
2381		}
2382	}
2383
2384	return deqs;
2385}
2386
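/* Translate one TGSI instruction. If any destination component is also read
 * as a source, fall back to emitting the instruction once per component in
 * the order chosen by nv50_revdep_reorder; components that still conflict
 * are written to freshly allocated temporaries and copied (or saturated)
 * into the real destination afterwards.
 */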
2387static boolean
2388nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
2389{
2390	struct tgsi_full_instruction insn = tok->FullInstruction;
2391	const struct tgsi_full_dst_register *fd;
2392	unsigned i, deqs, rdep[4], m[4];
2393
2394	fd = &tok->FullInstruction.FullDstRegisters[0];
2395	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
2396
2397	if (is_scalar_op(insn.Instruction.Opcode)) {
2398		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
2399		if (!pc->r_brdc)
2400			pc->r_brdc = temp_temp(pc);
2401		return nv50_program_tx_insn(pc, &insn);
2402	}
2403	pc->r_brdc = NULL;
2404
2405	if (!deqs)
2406		return nv50_program_tx_insn(pc, &insn);
2407
2408	deqs = nv50_revdep_reorder(m, rdep);
2409
2410	for (i = 0; i < 4; ++i) {
2411		assert(pc->r_dst[m[i]] == NULL);
2412
2413		insn.FullDstRegisters[0].DstRegister.WriteMask =
2414			fd->DstRegister.WriteMask & (1 << m[i]);
2415
2416		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
2417			continue;
2418
2419		if (deqs & (1 << i))
2420			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
2421
2422		if (!nv50_program_tx_insn(pc, &insn))
2423			return FALSE;
2424	}
2425
2426	for (i = 0; i < 4; i++) {
2427		struct nv50_reg *reg = pc->r_dst[i];
2428		if (!reg)
2429			continue;
2430		pc->r_dst[i] = NULL;
2431
2432		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
2433			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
2434		else
2435			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
2436		free_temp(pc, reg);
2437	}
2438
2439	return TRUE;
2440}
2441
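/* Emit the interpolation instruction for one FP input. For perspective
 * inputs an interpolant value is set up once per program (separately for
 * centroid and non-centroid) by interpolating and then taking its reciprocal
 * (flop 0, the same op used for TGSI_OPCODE_RCP above), presumably yielding
 * 1/w.
 */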
2442static void
2443load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
2444{
2445	struct nv50_reg *iv, **ppiv;
2446	unsigned mode = pc->interp_mode[reg->index];
2447
2448	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
2449	iv = *ppiv;
2450
2451	if ((mode & INTERP_PERSPECTIVE) && !iv) {
2452		iv = *ppiv = alloc_temp(pc, NULL);
2453		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
2454
2455		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2456		emit_flop(pc, 0, iv, iv);
2457
2458		/* XXX: when loading interpolants dynamically, move these
2459		 * to the program head, or make sure they can't be skipped.
2460		 */
2461	}
2462
2463	emit_interp(pc, reg, iv, mode);
2464}
2465
2466static boolean
2467nv50_program_tx_prep(struct nv50_pc *pc)
2468{
2469	struct tgsi_parse_context tp;
2470	struct nv50_program *p = pc->p;
2471	boolean ret = FALSE;
2472	unsigned i, c, flat_nr = 0;
2473
2474	tgsi_parse_init(&tp, pc->p->pipe.tokens);
2475	while (!tgsi_parse_end_of_tokens(&tp)) {
2476		const union tgsi_full_token *tok = &tp.FullToken;
2477
2478		tgsi_parse_token(&tp);
2479		switch (tok->Token.Type) {
2480		case TGSI_TOKEN_TYPE_IMMEDIATE:
2481		{
2482			const struct tgsi_full_immediate *imm =
2483				&tp.FullToken.FullImmediate;
2484
2485			ctor_immd(pc, imm->u[0].Float,
2486				      imm->u[1].Float,
2487				      imm->u[2].Float,
2488				      imm->u[3].Float);
2489		}
2490			break;
2491		case TGSI_TOKEN_TYPE_DECLARATION:
2492		{
2493			const struct tgsi_full_declaration *d;
2494			unsigned si, last, first, mode;
2495
2496			d = &tp.FullToken.FullDeclaration;
2497			first = d->DeclarationRange.First;
2498			last = d->DeclarationRange.Last;
2499
2500			switch (d->Declaration.File) {
2501			case TGSI_FILE_TEMPORARY:
2502				break;
2503			case TGSI_FILE_OUTPUT:
2504				if (!d->Declaration.Semantic ||
2505				    p->type == PIPE_SHADER_FRAGMENT)
2506					break;
2507
2508				si = d->Semantic.SemanticIndex;
2509				switch (d->Semantic.SemanticName) {
2510				case TGSI_SEMANTIC_BCOLOR:
2511					p->cfg.two_side[si].hw = first;
2512					if (p->cfg.io_nr > first)
2513						p->cfg.io_nr = first;
2514					break;
2515				case TGSI_SEMANTIC_PSIZE:
2516					p->cfg.psiz = first;
2517					if (p->cfg.io_nr > first)
2518						p->cfg.io_nr = first;
2519					break;
2520					/*
2521				case TGSI_SEMANTIC_CLIP_DISTANCE:
2522					p->cfg.clpd = MIN2(p->cfg.clpd, first);
2523					break;
2524					*/
2525				default:
2526					break;
2527				}
2528				break;
2529			case TGSI_FILE_INPUT:
2530			{
2531				if (p->type != PIPE_SHADER_FRAGMENT)
2532					break;
2533
2534				switch (d->Declaration.Interpolate) {
2535				case TGSI_INTERPOLATE_CONSTANT:
2536					mode = INTERP_FLAT;
2537					flat_nr++;
2538					break;
2539				case TGSI_INTERPOLATE_PERSPECTIVE:
2540					mode = INTERP_PERSPECTIVE;
2541					p->cfg.regs[1] |= 0x08 << 24;
2542					break;
2543				default:
2544					mode = INTERP_LINEAR;
2545					break;
2546				}
2547				if (d->Declaration.Centroid)
2548					mode |= INTERP_CENTROID;
2549
2550				assert(last < 32);
2551				for (i = first; i <= last; i++)
2552					pc->interp_mode[i] = mode;
2553			}
2554				break;
2555			case TGSI_FILE_ADDRESS:
2556			case TGSI_FILE_CONSTANT:
2557			case TGSI_FILE_SAMPLER:
2558				break;
2559			default:
2560				NOUVEAU_ERR("bad decl file %d\n",
2561					    d->Declaration.File);
2562				goto out_err;
2563			}
2564		}
2565			break;
2566		case TGSI_TOKEN_TYPE_INSTRUCTION:
2567			pc->insn_nr++;
2568			prep_inspect_insn(pc, &tok->FullInstruction);
2569			break;
2570		default:
2571			break;
2572		}
2573	}
2574
2575	if (p->type == PIPE_SHADER_VERTEX) {
2576		int rid = 0;
2577
2578		for (i = 0; i < pc->attr_nr * 4; ++i) {
2579			if (pc->attr[i].acc) {
2580				pc->attr[i].hw = rid++;
2581				p->cfg.attr[i / 32] |= 1 << (i % 32);
2582			}
2583		}
2584
2585		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
2586			p->cfg.io[i].hw = rid;
2587			p->cfg.io[i].id_vp = i;
2588
2589			for (c = 0; c < 4; ++c) {
2590				int n = i * 4 + c;
2591				if (!pc->result[n].acc)
2592					continue;
2593				pc->result[n].hw = rid++;
2594				p->cfg.io[i].mask |= 1 << c;
2595			}
2596		}
2597
2598		for (c = 0; c < 2; ++c)
2599			if (p->cfg.two_side[c].hw < 0x40)
2600				p->cfg.two_side[c] = p->cfg.io[
2601					p->cfg.two_side[c].hw];
2602
2603		if (p->cfg.psiz < 0x40)
2604			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
2605	} else
2606	if (p->type == PIPE_SHADER_FRAGMENT) {
2607		int rid, aid;
2608		unsigned n = 0, m = pc->attr_nr - flat_nr;
2609
2610		int base = (TGSI_SEMANTIC_POSITION ==
2611			    p->info.input_semantic_name[0]) ? 0 : 1;
2612
2613		/* non-flat interpolants have to be mapped to
2614		 * the lower hardware IDs, so sort them:
2615		 */
2616		for (i = 0; i < pc->attr_nr; i++) {
2617			if (pc->interp_mode[i] == INTERP_FLAT) {
2618				p->cfg.io[m].id_vp = i + base;
2619				p->cfg.io[m++].id_fp = i;
2620			} else {
2621				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
2622					p->cfg.io[n].linear = TRUE;
2623				p->cfg.io[n].id_vp = i + base;
2624				p->cfg.io[n++].id_fp = i;
2625			}
2626		}
2627
2628		if (!base) /* set w-coordinate mask from perspective interp */
2629			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
2630
2631		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
2632			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
2633
2634		for (n = 0; n < pc->attr_nr; ++n) {
2635			p->cfg.io[n].hw = rid = aid;
2636			i = p->cfg.io[n].id_fp;
2637
2638			for (c = 0; c < 4; ++c) {
2639				if (!pc->attr[i * 4 + c].acc)
2640					continue;
2641				pc->attr[i * 4 + c].rhw = rid++;
2642				p->cfg.io[n].mask |= 1 << c;
2643
2644				load_interpolant(pc, &pc->attr[i * 4 + c]);
2645			}
2646			aid += popcnt4(p->cfg.io[n].mask);
2647		}
2648
2649		if (!base)
2650			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
2651
2652		m = popcnt4(p->cfg.regs[1] >> 24);
2653
2654		/* set count of non-position inputs and of non-flat
2655		 * non-position inputs for FP_INTERPOLANT_CTRL
2656		 */
2657		p->cfg.regs[1] |= aid - m;
2658
2659		if (flat_nr) {
2660			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
2661			p->cfg.regs[1] |= (i - m) << 16;
2662		} else
2663			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
2664
2665		/* mark color semantic for light-twoside */
2666		n = 0x40;
2667		for (i = 0; i < pc->attr_nr; i++) {
2668			ubyte si, sn;
2669
2670			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
2671			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
2672
2673			if (sn == TGSI_SEMANTIC_COLOR) {
2674				p->cfg.two_side[si] = p->cfg.io[i];
2675
2676				/* increase colour count */
2677				p->cfg.regs[0] += popcnt4(
2678					p->cfg.two_side[si].mask) << 16;
2679
2680				n = MIN2(n, p->cfg.io[i].hw - m);
2681			}
2682		}
2683		if (n < 0x40)
2684			p->cfg.regs[0] += n;
2685
2686		/* Initialize FP results:
2687		 * FragDepth is always first TGSI and last hw output
2688		 */
2689		i = p->info.writes_z ? 4 : 0;
2690		for (rid = 0; i < pc->result_nr * 4; i++)
2691			pc->result[i].rhw = rid++;
2692		if (p->info.writes_z)
2693			pc->result[2].rhw = rid;
2694
2695		p->cfg.high_result = rid;
2696
2697		/* separate/different colour results for MRTs ? */
2698		if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1)
2699			p->cfg.regs[2] |= 1;
2700	}
2701
2702	if (pc->immd_nr) {
2703		int rid = 0;
2704
2705		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
2706		if (!pc->immd)
2707			goto out_err;
2708
2709		for (i = 0; i < pc->immd_nr; i++) {
2710			for (c = 0; c < 4; c++, rid++)
2711				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
2712		}
2713	}
2714
2715	ret = TRUE;
2716out_err:
2717	if (pc->iv_p)
2718		free_temp(pc, pc->iv_p);
2719	if (pc->iv_c)
2720		free_temp(pc, pc->iv_c);
2721
2722	tgsi_parse_free(&tp);
2723	return ret;
2724}
2725
2726static void
2727free_nv50_pc(struct nv50_pc *pc)
2728{
2729	if (pc->immd)
2730		FREE(pc->immd);
2731	if (pc->param)
2732		FREE(pc->param);
2733	if (pc->result)
2734		FREE(pc->result);
2735	if (pc->attr)
2736		FREE(pc->attr);
2737	if (pc->temp)
2738		FREE(pc->temp);
2739
2740	FREE(pc);
2741}
2742
2743static boolean
2744ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
2745{
2746	int i, c;
2747	unsigned rtype[2] = { P_ATTR, P_RESULT };
2748
2749	pc->p = p;
2750	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
2751	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
2752	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
2753	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
2754	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
2755	assert(pc->addr_nr <= 2);
2756
2757	p->cfg.high_temp = 4;
2758
2759	p->cfg.two_side[0].hw = 0x40;
2760	p->cfg.two_side[1].hw = 0x40;
2761
2762	switch (p->type) {
2763	case PIPE_SHADER_VERTEX:
2764		p->cfg.psiz = 0x40;
2765		p->cfg.clpd = 0x40;
2766		p->cfg.io_nr = pc->result_nr;
2767		break;
2768	case PIPE_SHADER_FRAGMENT:
2769		rtype[0] = rtype[1] = P_TEMP;
2770
2771		p->cfg.regs[0] = 0x01000004;
2772		p->cfg.io_nr = pc->attr_nr;
2773
2774		if (p->info.writes_z) {
2775			p->cfg.regs[2] |= 0x00000100;
2776			p->cfg.regs[3] |= 0x00000011;
2777		}
2778		if (p->info.uses_kill)
2779			p->cfg.regs[2] |= 0x00100000;
2780		break;
2781	}
2782
2783	if (pc->temp_nr) {
2784		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
2785		if (!pc->temp)
2786			return FALSE;
2787
2788		for (i = 0; i < pc->temp_nr * 4; ++i)
2789			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
2790	}
2791
2792	if (pc->attr_nr) {
2793		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
2794		if (!pc->attr)
2795			return FALSE;
2796
2797		for (i = 0; i < pc->attr_nr * 4; ++i)
2798			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
2799	}
2800
2801	if (pc->result_nr) {
2802		unsigned nr = pc->result_nr * 4;
2803
2804		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
2805		if (!pc->result)
2806			return FALSE;
2807
2808		for (i = 0; i < nr; ++i)
2809			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
2810	}
2811
2812	if (pc->param_nr) {
2813		int rid = 0;
2814
2815		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
2816		if (!pc->param)
2817			return FALSE;
2818
2819		for (i = 0; i < pc->param_nr; ++i)
2820			for (c = 0; c < 4; ++c, ++rid)
2821				ctor_reg(&pc->param[rid], P_CONST, i, rid);
2822	}
2823
2824	if (pc->addr_nr) {
2825		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *));
2826		if (!pc->addr)
2827			return FALSE;
2828	}
2829	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
2830		ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1);
2831
2832	return TRUE;
2833}
2834
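/* FP results are computed into ordinary temporaries; wherever the register a
 * value ended up in (hw) differs from the output register assigned to it
 * (rhw), append a MOV so the final value lands in the right place.
 */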
2835static void
2836nv50_fp_move_results(struct nv50_pc *pc)
2837{
2838	struct nv50_reg reg;
2839	unsigned i;
2840
2841	ctor_reg(&reg, P_TEMP, -1, -1);
2842
2843	for (i = 0; i < pc->result_nr * 4; ++i) {
2844		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
2845			continue;
2846		if (pc->result[i].rhw != pc->result[i].hw) {
2847			reg.hw = pc->result[i].rhw;
2848			emit_mov(pc, &reg, &pc->result[i]);
2849		}
2850	}
2851}
2852
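/* Post-process the instruction stream: 32 bit instructions appear to be
 * valid only in pairs, so a lone short instruction at an odd position is
 * widened to 64 bits and the recorded branch targets (exec entries with a
 * param index but no mask) behind it are shifted accordingly; the final
 * instruction is also widened so the end bit can be set in inst[1].
 */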
2853static void
2854nv50_program_fixup_insns(struct nv50_pc *pc)
2855{
2856	struct nv50_program_exec *e, *prev = NULL, **bra_list;
2857	unsigned i, n, pos;
2858
2859	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
2860
2861	/* Collect branch instructions; their target offsets need adjusting
2862	 * when 32 bit instructions are converted to 64 bit ones.
2863	 */
2864	for (n = 0, e = pc->p->exec_head; e; e = e->next)
2865		if (e->param.index >= 0 && !e->param.mask)
2866			bra_list[n++] = e;
2867
2868	/* Make sure we don't have any single 32 bit instructions. */
2869	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
2870		pos += is_long(e) ? 2 : 1;
2871
2872		if ((pos & 1) && (!e->next || is_long(e->next))) {
2873			for (i = 0; i < n; ++i)
2874				if (bra_list[i]->param.index >= pos)
2875					bra_list[i]->param.index += 1;
2876			convert_to_long(pc, e);
2877			++pos;
2878		}
2879		if (e->next)
2880			prev = e;
2881	}
2882
2883	assert(!is_immd(pc->p->exec_head));
2884	assert(!is_immd(pc->p->exec_tail));
2885
2886	/* last instruction must be long so it can have the end bit set */
2887	if (!is_long(pc->p->exec_tail)) {
2888		convert_to_long(pc, pc->p->exec_tail);
2889		if (prev)
2890			convert_to_long(pc, prev);
2891	}
2892	assert(!(pc->p->exec_tail->inst[1] & 2));
2893	/* set the end-bit */
2894	pc->p->exec_tail->inst[1] |= 1;
2895
2896	FREE(bra_list);
2897}
2898
2899static boolean
2900nv50_program_tx(struct nv50_program *p)
2901{
2902	struct tgsi_parse_context parse;
2903	struct nv50_pc *pc;
2904	boolean ret;
2905
2906	pc = CALLOC_STRUCT(nv50_pc);
2907	if (!pc)
2908		return FALSE;
2909
2910	ret = ctor_nv50_pc(pc, p);
2911	if (ret == FALSE)
2912		goto out_cleanup;
2913
2914	ret = nv50_program_tx_prep(pc);
2915	if (ret == FALSE)
2916		goto out_cleanup;
2917
2918	tgsi_parse_init(&parse, pc->p->pipe.tokens);
2919	while (!tgsi_parse_end_of_tokens(&parse)) {
2920		const union tgsi_full_token *tok = &parse.FullToken;
2921
2922		/* don't allow half insn/immd on first and last instruction */
2923		pc->allow32 = TRUE;
2924		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2925			pc->allow32 = FALSE;
2926
2927		tgsi_parse_token(&parse);
2928
2929		switch (tok->Token.Type) {
2930		case TGSI_TOKEN_TYPE_INSTRUCTION:
2931			++pc->insn_cur;
2932			ret = nv50_tgsi_insn(pc, tok);
2933			if (ret == FALSE)
2934				goto out_err;
2935			break;
2936		default:
2937			break;
2938		}
2939	}
2940
2941	if (pc->p->type == PIPE_SHADER_FRAGMENT)
2942		nv50_fp_move_results(pc);
2943
2944	nv50_program_fixup_insns(pc);
2945
2946	p->param_nr = pc->param_nr * 4;
2947	p->immd_nr = pc->immd_nr * 4;
2948	p->immd = pc->immd_buf;
2949
2950out_err:
2951	tgsi_parse_free(&parse);
2952
2953out_cleanup:
2954	free_nv50_pc(pc);
2955	return ret;
2956}
2957
2958static void
2959nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2960{
2961	if (nv50_program_tx(p) == FALSE)
2962		assert(0);
2963	p->translated = TRUE;
2964}
2965
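/* Upload a run of floats into one of the constant buffers: CB_ADDR selects
 * the buffer and starting slot, then up to 2047 values at a time are pushed
 * through CB_DATA (the 0x40000000 flag presumably marks the method as
 * non-incrementing so every value goes to CB_DATA(0)).
 */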
2966static void
2967nv50_program_upload_data(struct nv50_context *nv50, float *map,
2968			unsigned start, unsigned count, unsigned cbuf)
2969{
2970	struct nouveau_channel *chan = nv50->screen->base.channel;
2971	struct nouveau_grobj *tesla = nv50->screen->tesla;
2972
2973	while (count) {
2974		unsigned nr = count > 2047 ? 2047 : count;
2975
2976		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2977		OUT_RING  (chan, (cbuf << 0) | (start << 8));
2978		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2979		OUT_RINGp (chan, map, nr);
2980
2981		map += nr;
2982		start += nr;
2983		count -= nr;
2984	}
2985}
2986
2987static void
2988nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2989{
2990	struct pipe_screen *pscreen = nv50->pipe.screen;
2991
2992	if (!p->data[0] && p->immd_nr) {
2993		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2994
2995		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
2996			while (heap->next && heap->size < p->immd_nr) {
2997				struct nv50_program *evict = heap->next->priv;
2998				nouveau_resource_free(&evict->data[0]);
2999			}
3000
3001			if (nouveau_resource_alloc(heap, p->immd_nr, p,
3002						   &p->data[0]))
3003				assert(0);
3004		}
3005
3006		/* immediates only need to be uploaded again when freed */
3007		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
3008					 p->immd_nr, NV50_CB_PMISC);
3009	}
3010
3011	assert(p->param_nr <= 512);
3012
3013	if (p->param_nr) {
3014		unsigned cb;
3015		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
3016					     PIPE_BUFFER_USAGE_CPU_READ);
3017
3018		if (p->type == PIPE_SHADER_VERTEX)
3019			cb = NV50_CB_PVP;
3020		else
3021			cb = NV50_CB_PFP;
3022
3023		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
3024		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
3025	}
3026}
3027
3028static void
3029nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
3030{
3031	struct nouveau_channel *chan = nv50->screen->base.channel;
3032	struct nv50_program_exec *e;
3033	uint32_t *up, i;
3034	boolean upload = FALSE;
3035
3036	if (!p->bo) {
3037		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
3038			       p->exec_size * 4, &p->bo);
3039		upload = TRUE;
3040	}
3041
3042	if (p->data[0] && p->data[0]->start != p->data_start[0])
3043		upload = TRUE;
3044
3045	if (!upload)
3046		return;
3047
3048	up = MALLOC(p->exec_size * 4);
3049
3050	for (i = 0, e = p->exec_head; e; e = e->next) {
3051		unsigned ei, ci, bs;
3052
3053		if (e->param.index >= 0 && e->param.mask) {
3054			bs = (e->inst[1] >> 22) & 0x07;
3055			assert(bs < 2);
3056			ei = e->param.shift >> 5;
3057			ci = e->param.index;
3058			if (bs == 0)
3059				ci += p->data[bs]->start;
3060
3061			e->inst[ei] &= ~e->param.mask;
3062			e->inst[ei] |= (ci << e->param.shift);
3063		} else
3064		if (e->param.index >= 0) {
3065			/* zero mask means param is a jump/branch offset */
3066			assert(!(e->param.index & 1));
3067			/* seem to be 8 byte steps */
3068			ei = (e->param.index >> 1) + 0 /* START_ID */;
3069
3070			e->inst[0] &= 0xf0000fff;
3071			e->inst[0] |= ei << 12;
3072		}
3073
3074		up[i++] = e->inst[0];
3075		if (is_long(e))
3076			up[i++] = e->inst[1];
3077	}
3078	assert(i == p->exec_size);
3079
3080	if (p->data[0])
3081		p->data_start[0] = p->data[0]->start;
3082
3083#ifdef NV50_PROGRAM_DUMP
3084	NOUVEAU_ERR("-------\n");
3085	for (e = p->exec_head; e; e = e->next) {
3086		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
3087		if (is_long(e))
3088			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
3089	}
3090#endif
3091	nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM,
3092			 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144,
3093			 up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0,
3094			 0, 0, p->exec_size * 4, 1, 1);
3095
3096	FREE(up);
3097}
3098
3099void
3100nv50_vertprog_validate(struct nv50_context *nv50)
3101{
3102	struct nouveau_grobj *tesla = nv50->screen->tesla;
3103	struct nv50_program *p = nv50->vertprog;
3104	struct nouveau_stateobj *so;
3105
3106	if (!p->translated) {
3107		nv50_program_validate(nv50, p);
3108		if (!p->translated)
3109			assert(0);
3110	}
3111
3112	nv50_program_validate_data(nv50, p);
3113	nv50_program_validate_code(nv50, p);
3114
3115	so = so_new(13, 2);
3116	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
3117	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3118		      NOUVEAU_BO_HIGH, 0, 0);
3119	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3120		      NOUVEAU_BO_LOW, 0, 0);
3121	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
3122	so_data  (so, p->cfg.attr[0]);
3123	so_data  (so, p->cfg.attr[1]);
3124	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
3125	so_data  (so, p->cfg.high_result);
3126	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
3127	so_data  (so, p->cfg.high_result); //8);
3128	so_data  (so, p->cfg.high_temp);
3129	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
3130	so_data  (so, 0); /* program start offset */
3131	so_ref(so, &nv50->state.vertprog);
3132	so_ref(NULL, &so);
3133}
3134
3135void
3136nv50_fragprog_validate(struct nv50_context *nv50)
3137{
3138	struct nouveau_grobj *tesla = nv50->screen->tesla;
3139	struct nv50_program *p = nv50->fragprog;
3140	struct nouveau_stateobj *so;
3141
3142	if (!p->translated) {
3143		nv50_program_validate(nv50, p);
3144		if (!p->translated)
3145			assert(0);
3146	}
3147
3148	nv50_program_validate_data(nv50, p);
3149	nv50_program_validate_code(nv50, p);
3150
3151	so = so_new(64, 2);
3152	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
3153	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3154		      NOUVEAU_BO_HIGH, 0, 0);
3155	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
3156		      NOUVEAU_BO_LOW, 0, 0);
3157	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
3158	so_data  (so, p->cfg.high_temp);
3159	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
3160	so_data  (so, p->cfg.high_result);
3161	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
3162	so_data  (so, p->cfg.regs[2]);
3163	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
3164	so_data  (so, p->cfg.regs[3]);
3165	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
3166	so_data  (so, 0); /* program start offset */
3167	so_ref(so, &nv50->state.fragprog);
3168	so_ref(NULL, &so);
3169}
3170
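/* Build the POINT_COORD_REPLACE_MAP entries: generic FP inputs that are fed
 * by point sprite coordinates get each of their enabled components replaced;
 * m counts hardware interpolant slots starting at base, so inputs that keep
 * their values only advance the slot counter.
 */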
3171static void
3172nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
3173{
3174	struct nv50_program *fp = nv50->fragprog;
3175	struct nv50_program *vp = nv50->vertprog;
3176	unsigned i, c, m = base;
3177
3178	/* XXX: This can't work correctly in all cases yet, we either
3179	 * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has
3180	 * to be per FP input instead of per VP output
3181	 */
3182	memset(pntc, 0, 8 * sizeof(uint32_t));
3183
3184	for (i = 0; i < fp->cfg.io_nr; i++) {
3185		uint8_t sn, si;
3186		uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
3187		unsigned n = popcnt4(fp->cfg.io[i].mask);
3188
3189		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
3190			m += n;
3191			continue;
3192		}
3193
3194		sn = vp->info.input_semantic_name[j];
3195		si = vp->info.input_semantic_index[j];
3196
3197		if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) {
3198			ubyte mode =
3199				nv50->rasterizer->pipe.sprite_coord_mode[si];
3200
3201			if (mode == PIPE_SPRITE_COORD_NONE) {
3202				m += n;
3203				continue;
3204			}
3205		}
3206
3207		/* this is either PointCoord or replaced by sprite coords */
3208		for (c = 0; c < 4; c++) {
3209			if (!(fp->cfg.io[i].mask & (1 << c)))
3210				continue;
3211			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
3212			++m;
3213		}
3214	}
3215}
3216
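/* Fill one FP input's slice of the VP result map: every component the FP
 * reads gets the VP output register that feeds it, or a constant slot when
 * the VP does not write it (0x40 for x/y/z, 0x41 for w, presumably reading
 * as 0.0 and 1.0); linearly interpolated components are flagged in lin[].
 */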
3217static int
3218nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
3219	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
3220{
3221	int c;
3222	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
3223	uint8_t *map = (uint8_t *)p_map;
3224
3225	for (c = 0; c < 4; ++c) {
3226		if (mf & 1) {
3227			if (fpi->linear == TRUE)
3228				lin[mid / 32] |= 1 << (mid % 32);
3229			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
3230		}
3231
3232		oid += mv & 1;
3233		mf >>= 1;
3234		mv >>= 1;
3235	}
3236
3237	return mid;
3238}
3239
3240void
3241nv50_linkage_validate(struct nv50_context *nv50)
3242{
3243	struct nouveau_grobj *tesla = nv50->screen->tesla;
3244	struct nv50_program *vp = nv50->vertprog;
3245	struct nv50_program *fp = nv50->fragprog;
3246	struct nouveau_stateobj *so;
3247	struct nv50_sreg4 dummy, *vpo;
3248	int i, n, c, m = 0;
3249	uint32_t map[16], lin[4], reg[5], pcrd[8];
3250
3251	memset(map, 0, sizeof(map));
3252	memset(lin, 0, sizeof(lin));
3253
3254	reg[1] = 0x00000004; /* low and high clip distance map ids */
3255	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
3256	reg[3] = 0x00000000; /* point size map id & enable */
3257	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
3258	reg[4] = fp->cfg.regs[1]; /* interpolant info */
3259
3260	dummy.linear = FALSE;
3261	dummy.mask = 0xf; /* map all components of HPOS */
3262	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
3263
3264	dummy.mask = 0x0;
3265
3266	if (vp->cfg.clpd < 0x40) {
3267		for (c = 0; c < vp->cfg.clpd_nr; ++c)
3268			map[m++] = vp->cfg.clpd + c;
3269		reg[1] = (m << 8);
3270	}
3271
3272	reg[0] |= m << 8; /* adjust BFC0 id */
3273
3274	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
3275	if (nv50->rasterizer->pipe.light_twoside) {
3276		vpo = &vp->cfg.two_side[0];
3277
3278		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
3279		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
3280	}
3281
3282	reg[0] += m - 4; /* adjust FFC0 id */
3283	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
3284
3285	i = 0;
3286	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
3287		i = 1;
3288	for (; i < fp->cfg.io_nr; i++) {
3289		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
3290		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
3291
3292		n = fp->cfg.io[i].id_vp;
3293		if (n >= vp->cfg.io_nr ||
3294		    vp->info.output_semantic_name[n] != sn ||
3295		    vp->info.output_semantic_index[n] != si)
3296			vpo = &dummy;
3297		else
3298			vpo = &vp->cfg.io[n];
3299
3300		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
3301	}
3302
3303	if (nv50->rasterizer->pipe.point_size_per_vertex) {
3304		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
3305		reg[3] = (m++ << 4) | 1;
3306	}
3307
3308	/* now fill the stateobj */
3309	so = so_new(64, 0);
3310
3311	n = (m + 3) / 4;
3312	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
3313	so_data  (so, m);
3314	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
3315	so_datap (so, map, n);
3316
3317	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
3318	so_datap (so, reg, 4);
3319
3320	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
3321	so_data  (so, reg[4]);
3322
3323	so_method(so, tesla, 0x1540, 4);
3324	so_datap (so, lin, 4);
3325
3326	if (nv50->rasterizer->pipe.point_sprite) {
3327		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
3328
3329		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
3330		so_datap (so, pcrd, 8);
3331	}
3332
3333	so_ref(so, &nv50->state.programs);
3334	so_ref(NULL, &so);
3335}
3336
3337void
3338nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
3339{
3340	while (p->exec_head) {
3341		struct nv50_program_exec *e = p->exec_head;
3342
3343		p->exec_head = e->next;
3344		FREE(e);
3345	}
3346	p->exec_tail = NULL;
3347	p->exec_size = 0;
3348
3349	nouveau_bo_ref(NULL, &p->bo);
3350
3351	nouveau_resource_free(&p->data[0]);
3352
3353	p->translated = 0;
3354}
3355