nv50_program.c revision 6516594c8eec1088ee59e7c3254b2fdced2ff04b
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
34#define NV50_SU_MAX_TEMP 64
35//#define NV50_PROGRAM_DUMP
36
37/* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
 * 	- Drop it; instead, introduce a way to negate args for ops that
 * 	  support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
 * Beware of dst==src vectors, we can overwrite components that are needed.
54 * 	ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * 	FP attr/result assignment - how?
58 * 		attrib
59 * 			- 0x16bc maps vp output onto fp hpos
60 * 			- 0x16c0 maps vp output onto fp col0
61 * 		result
62 * 			- colr always 0-3
63 * 			- depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * 	      "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * 	- 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * 	- XX == FP high something
75 */
/* A single scalar value tracked by the code generator. */
struct nv50_reg {
	/* which register file the value lives in */
	enum {
		P_TEMP,   /* hw temporary (GPR) */
		P_ATTR,   /* input attribute */
		P_RESULT, /* shader output */
		P_CONST,  /* constant buffer entry */
		P_IMMD    /* immediate, stored in pc->immd_buf */
	} type;
	int index; /* TGSI index; -1 for compiler-generated temporaries */

	int hw;  /* assigned hw slot, or -1 if not yet allocated */
	int neg; /* non-zero: negate the value when it is read */

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
92
/* Per-shader translation state for the TGSI -> nv50 code generator. */
struct nv50_pc {
	struct nv50_program *p;

	/* hw resources */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; /* occupancy of hw temps */

	/* tgsi resources (arrays of 4 regs per TGSI vector) */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	float *immd_buf; /* 4 floats per immediate vector */
	int immd_nr;

	/* scratch temps, released after each TGSI insn by kill_temp_temp() */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;

	/* broadcast and destination replacement regs */
	struct nv50_reg *r_brdc;
	struct nv50_reg *r_dst[4];

	unsigned interp_mode[32];
	/* perspective interpolation registers */
	struct nv50_reg *iv_p;
	struct nv50_reg *iv_c;

	/* current instruction and total number of insns */
	unsigned insn_cur;
	unsigned insn_nr;

	boolean allow32; /* permit emission of short (32 bit) instructions */
};
130
131static INLINE void
132ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
133{
134	reg->type = type;
135	reg->index = index;
136	reg->hw = hw;
137	reg->neg = 0;
138	reg->rhw = -1;
139	reg->acc = 0;
140}
141
142static INLINE unsigned
143popcnt4(uint32_t val)
144{
145	static const unsigned cnt[16]
146	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
147	return cnt[val & 0xf];
148}
149
/* Bind reg to a hardware resource: bump the high-water marks for
 * results/temps, and for a P_TEMP without a hw index yet, claim a free
 * slot in pc->r_temp[].
 */
static void
alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
{
	int i = 0;

	if (reg->type == P_RESULT) {
		if (pc->p->cfg.high_result < (reg->hw + 1))
			pc->p->cfg.high_result = reg->hw + 1;
	}

	if (reg->type != P_TEMP)
		return;

	if (reg->hw >= 0) {
		/*XXX: do this here too to catch FP temp-as-attr usage..
		 *     not clean, but works */
		if (pc->p->cfg.high_temp < (reg->hw + 1))
			pc->p->cfg.high_temp = reg->hw + 1;
		return;
	}

	if (reg->rhw != -1) {
		/* try to allocate temporary with index rhw first */
		if (!(pc->r_temp[reg->rhw])) {
			pc->r_temp[reg->rhw] = reg;
			reg->hw = reg->rhw;
			if (pc->p->cfg.high_temp < (reg->rhw + 1))
				pc->p->cfg.high_temp = reg->rhw + 1;
			return;
		}
		/* make sure we don't get things like $r0 needs to go
		 * in $r1 and $r1 in $r0
		 */
		i = pc->result_nr * 4;
	}

	/* first-fit scan for a free hw temporary */
	for (; i < NV50_SU_MAX_TEMP; i++) {
		if (!(pc->r_temp[i])) {
			pc->r_temp[i] = reg;
			reg->hw = i;
			if (pc->p->cfg.high_temp < (i + 1))
				pc->p->cfg.high_temp = i + 1;
			return;
		}
	}

	/* out of hardware temporaries */
	assert(0);
}
198
199static struct nv50_reg *
200alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
201{
202	struct nv50_reg *r;
203	int i;
204
205	if (dst && dst->type == P_TEMP && dst->hw == -1)
206		return dst;
207
208	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
209		if (!pc->r_temp[i]) {
210			r = MALLOC_STRUCT(nv50_reg);
211			ctor_reg(r, P_TEMP, -1, i);
212			pc->r_temp[i] = r;
213			return r;
214		}
215	}
216
217	assert(0);
218	return NULL;
219}
220
/* Assign the hw of the discarded temporary register src
 * to the tgsi register dst and free src.
 */
static void
assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	/* src must be compiler-generated and already placed in a hw slot */
	assert(src->index == -1 && src->hw != -1);

	/* release dst's old slot before taking over src's */
	if (dst->hw != -1)
		pc->r_temp[dst->hw] = NULL;
	pc->r_temp[src->hw] = dst;
	dst->hw = src->hw;

	FREE(src);
}
236
/* release the hardware resource held by r */
static void
release_hw(struct nv50_pc *pc, struct nv50_reg *r)
{
	assert(r->type == P_TEMP);
	if (r->hw == -1)
		return;

	/* r_temp[] must agree with the reg's own idea of its slot */
	assert(pc->r_temp[r->hw] == r);
	pc->r_temp[r->hw] = NULL;

	r->acc = 0;
	/* compiler-generated regs are owned here and freed with their slot */
	if (r->index == -1)
		FREE(r);
}
252
253static void
254free_temp(struct nv50_pc *pc, struct nv50_reg *r)
255{
256	if (r->index == -1) {
257		unsigned hw = r->hw;
258
259		FREE(pc->r_temp[hw]);
260		pc->r_temp[hw] = NULL;
261	}
262}
263
264static int
265alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
266{
267	int i;
268
269	if ((idx + 4) >= NV50_SU_MAX_TEMP)
270		return 1;
271
272	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
273	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
274		return alloc_temp4(pc, dst, idx + 4);
275
276	for (i = 0; i < 4; i++) {
277		dst[i] = MALLOC_STRUCT(nv50_reg);
278		ctor_reg(dst[i], P_TEMP, -1, idx + i);
279		pc->r_temp[idx + i] = dst[i];
280	}
281
282	return 0;
283}
284
/* Release a quad of temporaries obtained from alloc_temp4(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	int c = 4;

	while (c--)
		free_temp(pc, reg[c]);
}
293
/* Get a scratch temporary that lives until the end of the current
 * TGSI instruction (all such temps are released by kill_temp_temp()).
 */
static struct nv50_reg *
temp_temp(struct nv50_pc *pc)
{
	/* NOTE(review): with NDEBUG the assert disappears and the array
	 * below would overflow - relies on never needing > 16 scratch regs
	 */
	if (pc->temp_temp_nr >= 16)
		assert(0);

	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
	return pc->temp_temp[pc->temp_temp_nr++];
}
303
304static void
305kill_temp_temp(struct nv50_pc *pc)
306{
307	int i;
308
309	for (i = 0; i < pc->temp_temp_nr; i++)
310		free_temp(pc, pc->temp_temp[i]);
311	pc->temp_temp_nr = 0;
312}
313
/* Append one immediate vec4 to pc->immd_buf and return its vector
 * index. NOTE(review): the REALLOC result is not checked - an OOM
 * here would crash on the following store.
 */
static int
ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
{
	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
			       (pc->immd_nr + 1) * 4 * sizeof(float));
	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;

	return pc->immd_nr++;
}
326
/* Return a new P_IMMD reg whose hw index addresses scalar f in the
 * immediate buffer. Reuses an existing entry when one matches;
 * otherwise stores { f, -f, 0.5*f, 0 } so the common derived values
 * can be found by later searches. Caller owns (FREEs) the reg.
 */
static struct nv50_reg *
alloc_immd(struct nv50_pc *pc, float f)
{
	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
	unsigned hw;

	for (hw = 0; hw < pc->immd_nr * 4; hw++)
		if (pc->immd_buf[hw] == f)
			break;

	if (hw == pc->immd_nr * 4)
		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;

	ctor_reg(r, P_IMMD, -1, hw);
	return r;
}
343
344static struct nv50_program_exec *
345exec(struct nv50_pc *pc)
346{
347	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
348
349	e->param.index = -1;
350	return e;
351}
352
353static void
354emit(struct nv50_pc *pc, struct nv50_program_exec *e)
355{
356	struct nv50_program *p = pc->p;
357
358	if (p->exec_tail)
359		p->exec_tail->next = e;
360	if (!p->exec_head)
361		p->exec_head = e;
362	p->exec_tail = e;
363	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
364}
365
366static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
367
368static boolean
369is_long(struct nv50_program_exec *e)
370{
371	if (e->inst[0] & 1)
372		return TRUE;
373	return FALSE;
374}
375
376static boolean
377is_immd(struct nv50_program_exec *e)
378{
379	if (is_long(e) && (e->inst[1] & 3) == 3)
380		return TRUE;
381	return FALSE;
382}
383
/* Set the execution condition (pred) and predicate register index
 * (idx) of e; these fields live in word 1, so force the long form.
 */
static INLINE void
set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
	 struct nv50_program_exec *e)
{
	set_long(pc, e);
	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); /* clear cond + $p idx */
	e->inst[1] |= (pred << 7) | (idx << 12);
}
392
/* Enable (on = 1) or disable writing this instruction's condition
 * result to predicate register idx; forces the long form.
 */
static INLINE void
set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
	    struct nv50_program_exec *e)
{
	set_long(pc, e);
	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
	e->inst[1] |= (idx << 4) | (on << 6);
}
401
/* Turn e into a two-word (long) instruction. The predicate fields
 * only exist in the long encoding, so initialize them to "always
 * execute, don't write $p" right after setting the flag.
 */
static INLINE void
set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	if (is_long(e))
		return;

	e->inst[0] |= 1;
	set_pred(pc, 0xf, 0, e);
	set_pred_wr(pc, 0, 0, e);
}
412
/* Encode dst as the destination of e. Writing a shader output
 * (P_RESULT) requires the long form with the output flag in word 1.
 */
static INLINE void
set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
{
	if (dst->type == P_RESULT) {
		set_long(pc, e);
		e->inst[1] |= 0x00000008; /* destination is an output reg */
	}

	alloc_reg(pc, dst);
	e->inst[0] |= (dst->hw << 2);
}
424
/* Encode the value of immediate imm inline into e (long immediate
 * form); the 32-bit float pattern is split across the two words.
 */
static INLINE void
set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
{
	float f = pc->immd_buf[imm->hw];
	unsigned val = fui(imm->neg ? -f : f);

	set_long(pc, e);
	/*XXX: can't be predicated - bits overlap.. catch cases where both
	 *     are required and avoid them. */
	set_pred(pc, 0, 0, e);
	set_pred_wr(pc, 0, 0, e);

	e->inst[1] |= 0x00000002 | 0x00000001; /* mark as immediate form */
	e->inst[0] |= (val & 0x3f) << 16;      /* low 6 bits in word 0 */
	e->inst[1] |= (val >> 6) << 2;         /* remaining bits in word 1 */
}
441
442
443#define INTERP_LINEAR		0
444#define INTERP_FLAT			1
445#define INTERP_PERSPECTIVE	2
446#define INTERP_CENTROID		4
447
448/* interpolant index has been stored in dst->rhw */
449static void
450emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
451		unsigned mode)
452{
453	assert(dst->rhw != -1);
454	struct nv50_program_exec *e = exec(pc);
455
456	e->inst[0] |= 0x80000000;
457	set_dst(pc, dst, e);
458	e->inst[0] |= (dst->rhw << 16);
459
460	if (mode & INTERP_FLAT) {
461		e->inst[0] |= (1 << 8);
462	} else {
463		if (mode & INTERP_PERSPECTIVE) {
464			e->inst[0] |= (1 << 25);
465			alloc_reg(pc, iv);
466			e->inst[0] |= (iv->hw << 9);
467		}
468
469		if (mode & INTERP_CENTROID)
470			e->inst[0] |= (1 << 24);
471	}
472
473	emit(pc, e);
474}
475
/* Record that e reads const-space data: remember src's index, the
 * bit position s and field mask m so the buffer reference can be
 * patched in at upload time via e->param.
 */
static void
set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
	 struct nv50_program_exec *e)
{
	set_long(pc, e);

	e->param.index = src->hw;
	e->param.shift = s;
	e->param.mask = m << (s % 32); /* s >= 32 refers to word 1 */

	/* bit 22 set for P_CONST, clear for P_IMMD data - presumably
	 * selects the constant buffer; verify against upload code */
	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
}
488
/* Emit a MOV of src into dst, choosing the cheapest encoding the
 * operand types permit: inline immediate (if allow32 and dst is not
 * an output), const-space read, or plain register/attribute move.
 */
static void
emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x10000000;

	set_dst(pc, dst, e);

	if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
		set_immd(pc, src, e);
		/*XXX: 32-bit, but steals part of "half" reg space - need to
		 *     catch and handle this case if/when we do half-regs
		 */
	} else
	if (src->type == P_IMMD || src->type == P_CONST) {
		set_long(pc, e);
		set_data(pc, src, 0x7f, 9, e);
		e->inst[1] |= 0x20000000; /* src0 const? */
	} else {
		if (src->type == P_ATTR) {
			set_long(pc, e);
			e->inst[1] |= 0x00200000; /* source is an attribute */
		}

		alloc_reg(pc, src);
		e->inst[0] |= (src->hw << 9);
	}

	if (is_long(e) && !is_immd(e)) {
		e->inst[1] |= 0x04000000; /* 32-bit */
		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
		if (!(e->inst[1] & 0x20000000))
			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
	} else
		e->inst[0] |= 0x00008000;

	emit(pc, e);
}
528
529static INLINE void
530emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
531{
532	struct nv50_reg *imm = alloc_immd(pc, f);
533	emit_mov(pc, dst, imm);
534	FREE(imm);
535}
536
537static boolean
538check_swap_src_0_1(struct nv50_pc *pc,
539		   struct nv50_reg **s0, struct nv50_reg **s1)
540{
541	struct nv50_reg *src0 = *s0, *src1 = *s1;
542
543	if (src0->type == P_CONST) {
544		if (src1->type != P_CONST) {
545			*s0 = src1;
546			*s1 = src0;
547			return TRUE;
548		}
549	} else
550	if (src1->type == P_ATTR) {
551		if (src0->type != P_ATTR) {
552			*s0 = src1;
553			*s1 = src0;
554			return TRUE;
555		}
556	}
557
558	return FALSE;
559}
560
/* Encode src as operand 0 of e. Attributes are flagged in word 1;
 * const/immediate values must first be copied into a scratch temp
 * since slot 0 cannot read them directly.
 */
static void
set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		set_long(pc, e);
		e->inst[1] |= 0x00200000;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	}

	alloc_reg(pc, src);
	e->inst[0] |= (src->hw << 9);
}
578
/* Encode src as operand 1 of e. Attributes always go through a
 * scratch temp. Only one const-space operand fits per instruction
 * (0x00800000 = slot 1 uses const, 0x01000000 = slot 2 does); if
 * slot 2 already claimed it, copy through a temp instead.
 */
static void
set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		assert(!(e->inst[0] & 0x00800000));
		if (e->inst[0] & 0x01000000) {
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 16, e);
			e->inst[0] |= 0x00800000;
		}
	}

	alloc_reg(pc, src);
	e->inst[0] |= (src->hw << 16);
}
604
/* Encode src as operand 2 of e (long form only). Mirror image of
 * set_src_1: const data may be referenced from slot 2 unless slot 1
 * already uses the single const-space reference.
 */
static void
set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	set_long(pc, e);

	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		assert(!(e->inst[0] & 0x01000000));
		if (e->inst[0] & 0x00800000) {
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 32+14, e);
			e->inst[0] |= 0x01000000;
		}
	}

	alloc_reg(pc, src);
	e->inst[1] |= (src->hw << 14);
}
632
/* dst = src0 * src1. Uses the short immediate form when possible;
 * a single resulting negation is folded into the opcode (different
 * bit position for the short vs. long encoding).
 */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xc0000000;

	if (!pc->allow32)
		set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_IMMD && !is_long(e)) {
		/* immediate form: negate via the short-form bit */
		if (src0->neg)
			e->inst[0] |= 0x00008000;
		set_immd(pc, src1, e);
	} else {
		set_src_1(pc, src1, e);
		if (src0->neg ^ src1->neg) { /* negations cancel pairwise */
			if (is_long(e))
				e->inst[1] |= 0x08000000;
			else
				e->inst[0] |= 0x00008000;
		}
	}

	emit(pc, e);
}
663
/* dst = src0 + src1 with per-source negate. Short form only when
 * allowed and no negation is needed; const/attr second operands (or
 * an already-long insn) use the src2 slot of the long form.
 */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	check_swap_src_0_1(pc, &src0, &src1);

	if (!pc->allow32 || src0->neg || src1->neg) {
		set_long(pc, e);
		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
	}

	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
		set_src_2(pc, src1, e);
	else
	if (src1->type == P_IMMD)
		set_immd(pc, src1, e);
	else
		set_src_1(pc, src1, e);

	emit(pc, e);
}
691
/* dst = min/max(src0, src1); sub selects the operation (callers use
 * 4 for max and 5 for min, see emit_lit).
 */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= (sub << 29);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	emit(pc, e);
}
709
710static INLINE void
711emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
712	 struct nv50_reg *src1)
713{
714	src1->neg ^= 1;
715	emit_add(pc, dst, src0, src1);
716	src1->neg ^= 1;
717}
718
/* dst = src0 * src1 + src2; source negations are folded into the
 * word-1 bits (product and addend negated independently).
 */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xe0000000;

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);
	set_src_2(pc, src2, e);

	if (src0->neg ^ src1->neg) /* negate the product */
		e->inst[1] |= 0x04000000;
	if (src2->neg)             /* negate the addend */
		e->inst[1] |= 0x08000000;

	emit(pc, e);
}
740
741static INLINE void
742emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
743	 struct nv50_reg *src1, struct nv50_reg *src2)
744{
745	src2->neg ^= 1;
746	emit_mad(pc, dst, src0, src1, src2);
747	src2->neg ^= 1;
748}
749
/* Scalar special-function op. sub == 0 is RCP (fits the short
 * form); other subops select the function via word 1 (callers use
 * 3 and 6 for the LG2/EX2 pair in emit_pow - see there).
 */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x90000000;
	if (sub) {
		set_long(pc, e);
		e->inst[1] |= (sub << 29);
	}

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);

	emit(pc, e);
}
767
/* Emit the pre-op that prepares src for a following EX2 flop
 * (subop 6 with the extra 0x4000 flag) - see emit_pow().
 */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29) | 0x00004000;

	emit(pc, e);
}
782
/* Emit the pre-op (argument reduction) required before the SIN/COS
 * flops; same encoding as emit_preex2 minus the 0x4000 flag.
 */
static void
emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29);

	emit(pc, e);
}
797
798#define CVTOP_RN	0x01
799#define CVTOP_FLOOR	0x03
800#define CVTOP_CEIL	0x05
801#define CVTOP_TRUNC	0x07
802#define CVTOP_SAT	0x08
803#define CVTOP_ABS	0x10
804
805/* 0x04 == 32 bit */
806/* 0x40 == dst is float */
807/* 0x80 == src is float */
808#define CVT_F32_F32 0xc4
809#define CVT_F32_S32 0x44
810#define CVT_F32_U32 0x64
811#define CVT_S32_F32 0x8c
812#define CVT_S32_S32 0x0c
813#define CVT_F32_F32_ROP 0xcc
814
/* Emit a CVT (convert/round) of src into dst. cvn is a CVTOP_*
 * rounding/abs/sat mode, fmt one of the CVT_* type encodings above.
 * wp >= 0 also writes the condition to predicate reg wp; dst ==
 * NULL discards the result (null-destination encoding), useful when
 * only the predicate write is wanted.
 */
static void
emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 int wp, unsigned cvn, unsigned fmt)
{
	struct nv50_program_exec *e;

	e = exec(pc);
	set_long(pc, e);

	e->inst[0] |= 0xa0000000;
	e->inst[1] |= 0x00004000;
	e->inst[1] |= (cvn << 16);
	e->inst[1] |= (fmt << 24);
	set_src_0(pc, src, e);

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);

	if (dst)
		set_dst(pc, dst, e);
	else {
		/* null destination: result discarded */
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	emit(pc, e);
}
842
843/* nv50 Condition codes:
844 *  0x1 = LT
845 *  0x2 = EQ
846 *  0x3 = LE
847 *  0x4 = GT
848 *  0x5 = NE
849 *  0x6 = GE
850 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
851 *  0x8 = unordered bit (allows NaN)
852 */
853static void
854emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
855	 struct nv50_reg *src0, struct nv50_reg *src1)
856{
857	struct nv50_program_exec *e = exec(pc);
858	struct nv50_reg *rdst;
859
860	assert(ccode < 16);
861	if (check_swap_src_0_1(pc, &src0, &src1))
862		ccode = ccode ^ 0x7;
863
864	rdst = dst;
865	if (dst && dst->type != P_TEMP)
866		dst = alloc_temp(pc, NULL);
867
868	/* set.u32 */
869	set_long(pc, e);
870	e->inst[0] |= 0xb0000000;
871	e->inst[1] |= 0x60000000 | (ccode << 14);
872
873	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
874	 * that doesn't seem to match what the hw actually does
875	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
876	 */
877
878	if (wp >= 0)
879		set_pred_wr(pc, 1, wp, e);
880	if (dst)
881		set_dst(pc, dst, e);
882	else {
883		e->inst[0] |= 0x000001fc;
884		e->inst[1] |= 0x00000008;
885	}
886
887	set_src_0(pc, src0, e);
888	set_src_1(pc, src1, e);
889
890	emit(pc, e);
891
892	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
893	if (rdst)
894		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
895	if (rdst && rdst != dst)
896		free_temp(pc, dst);
897}
898
899static INLINE unsigned
900map_tgsi_setop_cc(unsigned op)
901{
902	switch (op) {
903	case TGSI_OPCODE_SLT: return 0x1;
904	case TGSI_OPCODE_SGE: return 0x6;
905	case TGSI_OPCODE_SEQ: return 0x2;
906	case TGSI_OPCODE_SGT: return 0x4;
907	case TGSI_OPCODE_SLE: return 0x3;
908	case TGSI_OPCODE_SNE: return 0xd;
909	default:
910		assert(0);
911		return 0;
912	}
913}
914
/* dst = floor(src): CVT with round-toward-negative-infinity. */
static INLINE void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
}
920
/* dst = v^e computed as 2^(e * log2(v)) using the flop LG2 (sub 3)
 * and EX2 (sub 6, with its pre-op) sequence; like hw POW this is
 * only meaningful for v > 0.
 */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *temp = alloc_temp(pc, NULL);

	emit_flop(pc, 3, temp, v);      /* temp = log2(v) */
	emit_mul(pc, temp, temp, e);    /* temp = e * log2(v) */
	emit_preex2(pc, temp, temp);
	emit_flop(pc, 6, dst, temp);    /* dst = 2^temp */

	free_temp(pc, temp);
}
934
/* dst = |src| via the CVT abs modifier. */
static INLINE void
emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
}
940
/* dst = clamp(src, 0, 1) via the CVT saturate modifier. */
static INLINE void
emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
}
946
/* Emit TGSI LIT for the components in mask:
 *   dst.x = 1, dst.y = max(src.x, 0), dst.w = 1,
 *   dst.z = (src.x > 0) ? max(src.y, 0)^clamp(src.w, +-128) : 0,
 * using predicate reg 0 (written by the first MAX) for the
 * src.x > 0 condition.
 */
static void
emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0);
	struct nv50_reg *zero = alloc_immd(pc, 0.0);
	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
	struct nv50_reg *tmp[4];
	boolean allow32 = pc->allow32;

	pc->allow32 = FALSE;

	/* needed for dst.y and as input to the dst.z computation */
	if (mask & (3 << 1)) {
		tmp[0] = alloc_temp(pc, NULL);
		emit_minmax(pc, 4, tmp[0], src[0], zero);
	}

	if (mask & (1 << 2)) {
		/* have the MAX above also record its condition in $p0 */
		set_pred_wr(pc, 1, 0, pc->p->exec_tail);

		tmp[1] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[1], src[1], zero);

		/* clamp the exponent to +-127.999999 before POW */
		tmp[3] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[3], src[3], neg128);
		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);

		emit_pow(pc, dst[2], tmp[1], tmp[3]);
		/* dst.z = 0 when src.x <= 0: predicated (cond 3 = LE) mov */
		emit_mov(pc, dst[2], zero);
		set_pred(pc, 3, 0, pc->p->exec_tail);
	}

	if (mask & (1 << 1))
		assimilate_temp(pc, dst[1], tmp[0]);
	else
	if (mask & (1 << 2))
		free_temp(pc, tmp[0]);

	pc->allow32 = allow32;

	/* do this last, in case src[i,j] == dst[0,3] */
	if (mask & (1 << 0))
		emit_mov(pc, dst[0], one);

	if (mask & (1 << 3))
		emit_mov(pc, dst[3], one);

	FREE(pos128);
	FREE(neg128);
	FREE(zero);
	FREE(one);
}
1000
/* dst = -src, emitted as a CVT-style op (subop 7) with the negate
 * bit set; the "delta" comments mark the bits that differ from a
 * plain cvt.
 */
static void
emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[0] |= 0xa0000000; /* delta */
	e->inst[1] |= (7 << 29); /* delta */
	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
	e->inst[1] |= (1 << 14); /* src .f32 */
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);

	emit(pc, e);
}
1016
/* Discard the fragment when src is negative: first a CVT-style op
 * records the comparison in predicate reg 1 (sense inverted when
 * src->neg), then a predicated kill opcode executes on it.
 */
static void
emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
{
	struct nv50_program_exec *e;
	const int r_pred = 1;

	/* Sets predicate reg ? */
	e = exec(pc);
	e->inst[0] = 0xa00001fd;
	e->inst[1] = 0xc4014788;
	set_src_0(pc, src, e);
	set_pred_wr(pc, 1, r_pred, e);
	if (src->neg)
		e->inst[1] |= 0x20000000; /* invert the comparison */
	emit(pc, e);

	/* This is probably KILP */
	e = exec(pc);
	e->inst[0] = 0x000001fe;
	set_long(pc, e);
	set_pred(pc, 1 /* LT? */, r_pred, e);
	emit(pc, e);
}
1040
/* Emit a texture fetch from sampler 'unit' with coordinates src[]
 * (divided by w first when proj is set), writing the channels
 * selected by mask into dst[]. Coordinates are staged in a
 * 4-aligned temp quad t[0..3], which the TEX opcode overwrites with
 * its results.
 */
static void
emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
{
	struct nv50_reg *temp, *t[4];
	struct nv50_program_exec *e;

	unsigned c, mode, dim;

	/* number of coordinate components the target consumes */
	switch (type) {
	case TGSI_TEXTURE_1D:
		dim = 1;
		break;
	case TGSI_TEXTURE_UNKNOWN:
	case TGSI_TEXTURE_2D:
	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
	case TGSI_TEXTURE_RECT:
		dim = 2;
		break;
	case TGSI_TEXTURE_3D:
	case TGSI_TEXTURE_CUBE:
	case TGSI_TEXTURE_SHADOW2D:
	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
		dim = 3;
		break;
	default:
		assert(0);
		break;
	}

	/* some cards need t[0]'s hw index to be a multiple of 4 */
	alloc_temp4(pc, t, 0);

	if (proj) {
		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
			/* FP inputs: redo the interpolation with 1/w
			 * folded in instead of dividing afterwards */
			mode = pc->interp_mode[src[0]->index];

			t[3]->rhw = src[3]->rhw;
			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
			emit_flop(pc, 0, t[3], t[3]); /* t[3] = 1/w */

			for (c = 0; c < dim; c++) {
				t[c]->rhw = src[c]->rhw;
				emit_interp(pc, t[c], t[3],
					    (mode | INTERP_PERSPECTIVE));
			}
		} else {
			/* multiply the coords by 1/w manually */
			emit_flop(pc, 0, t[3], src[3]);
			for (c = 0; c < dim; c++)
				emit_mul(pc, t[c], src[c], t[3]);

			/* XXX: for some reason the blob sometimes uses MAD:
			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
			 */
		}
	} else {
		if (type == TGSI_TEXTURE_CUBE) {
			/* normalize by the largest component magnitude */
			temp = temp_temp(pc);
			emit_minmax(pc, 4, temp, src[0], src[1]);
			emit_minmax(pc, 4, temp, temp, src[2]);
			emit_flop(pc, 0, temp, temp);
			for (c = 0; c < 3; c++)
				emit_mul(pc, t[c], src[c], temp);
		} else {
			for (c = 0; c < dim; c++)
				emit_mov(pc, t[c], src[c]);
		}
	}

	e = exec(pc);
	set_long(pc, e);
	e->inst[0] |= 0xf0000000; /* TEX opcode */
	e->inst[1] |= 0x00000004;
	set_dst(pc, t[0], e);
	e->inst[0] |= (unit << 9);

	if (dim == 2)
		e->inst[0] |= 0x00400000;
	else
	if (dim == 3)
		e->inst[0] |= 0x00800000;

	/* write mask, split across the two words */
	e->inst[0] |= (mask & 0x3) << 25;
	e->inst[1] |= (mask & 0xc) << 12;

	emit(pc, e);

#if 1
	if (mask & 1) emit_mov(pc, dst[0], t[0]);
	if (mask & 2) emit_mov(pc, dst[1], t[1]);
	if (mask & 4) emit_mov(pc, dst[2], t[2]);
	if (mask & 8) emit_mov(pc, dst[3], t[3]);

	free_temp4(pc, t);
#else
	/* XXX: if p.e. MUL is used directly after TEX, it would still use
	 * the texture coordinates, not the fetched values: latency ? */

	for (c = 0; c < 4; c++) {
		if (mask & (1 << c))
			assimilate_temp(pc, dst[c], t[c]);
		else
			free_temp(pc, t[c]);
	}
#endif
}
1148
/* Convert the already-encoded short instruction e into its long
 * form, relocating the fields whose position differs between the
 * two encodings (q = bits to set in word 1, m = mask of word-0 bits
 * to keep), keyed on the opcode in the top nibble of word 0.
 */
static void
convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	unsigned q = 0, m = ~0;

	assert(!is_long(e));

	switch (e->inst[0] >> 28) {
	case 0x1:
		/* MOV */
		q = 0x0403c000;
		m = 0xffff7fff;
		break;
	case 0x8:
		/* INTERP (move centroid, perspective and flat bits) */
		m = ~0x03000100;
		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
		break;
	case 0x9:
		/* RCP */
		break;
	case 0xB:
		/* ADD */
		m = ~(127 << 16);
		q = ((e->inst[0] & (~m)) >> 2);
		break;
	case 0xC:
		/* MUL */
		m = ~0x00008000;
		q = ((e->inst[0] & (~m)) << 12);
		break;
	case 0xE:
		/* MAD (if src2 == dst) */
		q = ((e->inst[0] & 0x1fc) << 12);
		break;
	default:
		/* opcode has no known short form */
		assert(0);
		break;
	}

	set_long(pc, e);
	pc->p->exec_size++; /* instruction now occupies two words */

	e->inst[0] &= m;
	e->inst[1] |= q;
}
1196
1197static boolean
1198negate_supported(const struct tgsi_full_instruction *insn, int i)
1199{
1200	switch (insn->Instruction.Opcode) {
1201	case TGSI_OPCODE_DP3:
1202	case TGSI_OPCODE_DP4:
1203	case TGSI_OPCODE_MUL:
1204	case TGSI_OPCODE_KIL:
1205	case TGSI_OPCODE_ADD:
1206	case TGSI_OPCODE_SUB:
1207	case TGSI_OPCODE_MAD:
1208		return TRUE;
1209	case TGSI_OPCODE_POW:
1210		return (i == 1) ? TRUE : FALSE;
1211	default:
1212		return FALSE;
1213	}
1214}
1215
/* Return a read mask for source registers deduced from opcode & write mask. */
static unsigned
nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
{
	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;

	switch (insn->Instruction.Opcode) {
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_SIN:
		/* x feeds xyz results, w feeds w */
		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
	case TGSI_OPCODE_DP3:
		return 0x7;
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_DPH:
	case TGSI_OPCODE_KIL: /* WriteMask ignored */
		return 0xf;
	case TGSI_OPCODE_DST:
		/* src0 contributes y,z; src1 contributes y,w */
		return mask & (c ? 0xa : 0x6);
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SCS:
		/* scalar ops replicate from x */
		return 0x1;
	case TGSI_OPCODE_LIT:
		return 0xb; /* x, y, w */
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXP:
	{
		const struct tgsi_instruction_ext_texture *tex;

		assert(insn->Instruction.Extended);
		tex = &insn->InstructionExtTexture;

		mask = 0x7;
		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
			mask |= 0x8; /* w divisor for projection */

		/* trim to the coords the texture target actually reads */
		switch (tex->Texture) {
		case TGSI_TEXTURE_1D:
			mask &= 0x9;
			break;
		case TGSI_TEXTURE_2D:
			mask &= 0xb;
			break;
		default:
			break;
		}
	}
		return mask;
	case TGSI_OPCODE_XPD:
		/* each result channel reads the two other channels */
		x = 0;
		if (mask & 1) x |= 0x6;
		if (mask & 2) x |= 0x5;
		if (mask & 4) x |= 0x3;
		return x;
	default:
		break;
	}

	return mask;
}
1279
1280static struct nv50_reg *
1281tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1282{
1283	switch (dst->DstRegister.File) {
1284	case TGSI_FILE_TEMPORARY:
1285		return &pc->temp[dst->DstRegister.Index * 4 + c];
1286	case TGSI_FILE_OUTPUT:
1287		return &pc->result[dst->DstRegister.Index * 4 + c];
1288	case TGSI_FILE_NULL:
1289		return NULL;
1290	default:
1291		break;
1292	}
1293
1294	return NULL;
1295}
1296
/* Fetch the nv50_reg for one channel of a TGSI source operand,
 * applying the extended swizzle and the sign mode (abs/negate).
 *
 * If @neg is TRUE the consuming instruction supports negation of this
 * operand itself, so we can set the reg's neg flag instead of emitting
 * an explicit negate into a temporary.
 */
static struct nv50_reg *
tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
	 boolean neg)
{
	struct nv50_reg *r = NULL;
	struct nv50_reg *temp;
	unsigned sgn, c;

	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);

	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
	switch (c) {
	case TGSI_EXTSWIZZLE_X:
	case TGSI_EXTSWIZZLE_Y:
	case TGSI_EXTSWIZZLE_Z:
	case TGSI_EXTSWIZZLE_W:
		/* regular channel select: index into the per-file reg arrays */
		switch (src->SrcRegister.File) {
		case TGSI_FILE_INPUT:
			r = &pc->attr[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_TEMPORARY:
			r = &pc->temp[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_CONSTANT:
			r = &pc->param[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_IMMEDIATE:
			r = &pc->immd[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_SAMPLER:
			/* sampler index handled by the caller, no reg here */
			break;
		default:
			assert(0);
			break;
		}
		break;
	case TGSI_EXTSWIZZLE_ZERO:
		r = alloc_immd(pc, 0.0);
		return r;
	case TGSI_EXTSWIZZLE_ONE:
		/* fold the sign mode directly into the immediate value */
		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
			return alloc_immd(pc, -1.0);
		return alloc_immd(pc, 1.0);
	default:
		assert(0);
		break;
	}

	switch (sgn) {
	case TGSI_UTIL_SIGN_KEEP:
		break;
	case TGSI_UTIL_SIGN_CLEAR:
		/* abs(x): copy through a temp with absolute value */
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		r = temp;
		break;
	case TGSI_UTIL_SIGN_TOGGLE:
		/* -x: mark neg on the reg if the op can negate, else copy */
		if (neg)
			/* NOTE(review): sets the flag on the shared reg record;
			 * presumably cleared again after emission — verify */
			r->neg = 1;
		else {
			temp = temp_temp(pc);
			emit_neg(pc, temp, r);
			r = temp;
		}
		break;
	case TGSI_UTIL_SIGN_SET:
		/* -abs(x) */
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		if (neg)
			temp->neg = 1;
		else
			emit_neg(pc, temp, temp);
		r = temp;
		break;
	default:
		assert(0);
		break;
	}

	return r;
}
1378
1379/* return TRUE for ops that produce only a single result */
1380static boolean
1381is_scalar_op(unsigned op)
1382{
1383	switch (op) {
1384	case TGSI_OPCODE_COS:
1385	case TGSI_OPCODE_DP2:
1386	case TGSI_OPCODE_DP3:
1387	case TGSI_OPCODE_DP4:
1388	case TGSI_OPCODE_DPH:
1389	case TGSI_OPCODE_EX2:
1390	case TGSI_OPCODE_LG2:
1391	case TGSI_OPCODE_POW:
1392	case TGSI_OPCODE_RCP:
1393	case TGSI_OPCODE_RSQ:
1394	case TGSI_OPCODE_SIN:
1395		/*
1396	case TGSI_OPCODE_KIL:
1397	case TGSI_OPCODE_LIT:
1398	case TGSI_OPCODE_SCS:
1399		*/
1400		return TRUE;
1401	default:
1402		return FALSE;
1403	}
1404}
1405
1406/* Returns a bitmask indicating which dst components depend
1407 * on source s, component c (reverse of nv50_tgsi_src_mask).
1408 */
1409static unsigned
1410nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1411{
1412	if (is_scalar_op(op))
1413		return 0x1;
1414
1415	switch (op) {
1416	case TGSI_OPCODE_DST:
1417		return (1 << c) & (s ? 0xa : 0x6);
1418	case TGSI_OPCODE_XPD:
1419		switch (c) {
1420		case 0: return 0x6;
1421		case 1: return 0x5;
1422		case 2: return 0x3;
1423		case 3: return 0x0;
1424		default:
1425			assert(0);
1426			return 0x0;
1427		}
1428	case TGSI_OPCODE_LIT:
1429	case TGSI_OPCODE_SCS:
1430	case TGSI_OPCODE_TEX:
1431	case TGSI_OPCODE_TXP:
1432		/* these take care of dangerous swizzles themselves */
1433		return 0x0;
1434	case TGSI_OPCODE_IF:
1435	case TGSI_OPCODE_KIL:
1436		/* don't call this function for these ops */
1437		assert(0);
1438		return 0;
1439	default:
1440		/* linear vector instruction */
1441		return (1 << c);
1442	}
1443}
1444
1445static boolean
1446nv50_program_tx_insn(struct nv50_pc *pc,
1447		     const struct tgsi_full_instruction *inst)
1448{
1449	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
1450	unsigned mask, sat, unit;
1451	int i, c;
1452
1453	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1454	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1455
1456	memset(src, 0, sizeof(src));
1457
1458	for (c = 0; c < 4; c++) {
1459		if ((mask & (1 << c)) && !pc->r_dst[c])
1460			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1461		else
1462			dst[c] = pc->r_dst[c];
1463		rdst[c] = dst[c];
1464	}
1465
1466	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1467		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1468		unsigned src_mask;
1469		boolean neg_supp;
1470
1471		src_mask = nv50_tgsi_src_mask(inst, i);
1472		neg_supp = negate_supported(inst, i);
1473
1474		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1475			unit = fs->SrcRegister.Index;
1476
1477		for (c = 0; c < 4; c++)
1478			if (src_mask & (1 << c))
1479				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
1480	}
1481
1482	brdc = temp = pc->r_brdc;
1483	if (brdc && brdc->type != P_TEMP) {
1484		temp = temp_temp(pc);
1485		if (sat)
1486			brdc = temp;
1487	} else
1488	if (sat) {
1489		for (c = 0; c < 4; c++) {
1490			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
1491				continue;
1492			rdst[c] = dst[c];
1493			dst[c] = temp_temp(pc);
1494		}
1495	}
1496
1497	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
1498
1499	switch (inst->Instruction.Opcode) {
1500	case TGSI_OPCODE_ABS:
1501		for (c = 0; c < 4; c++) {
1502			if (!(mask & (1 << c)))
1503				continue;
1504			emit_abs(pc, dst[c], src[0][c]);
1505		}
1506		break;
1507	case TGSI_OPCODE_ADD:
1508		for (c = 0; c < 4; c++) {
1509			if (!(mask & (1 << c)))
1510				continue;
1511			emit_add(pc, dst[c], src[0][c], src[1][c]);
1512		}
1513		break;
1514	case TGSI_OPCODE_CEIL:
1515		for (c = 0; c < 4; c++) {
1516			if (!(mask & (1 << c)))
1517				continue;
1518			emit_cvt(pc, dst[c], src[0][c], -1,
1519				 CVTOP_CEIL, CVT_F32_F32);
1520		}
1521		break;
1522	case TGSI_OPCODE_COS:
1523		if (mask & 8) {
1524			emit_precossin(pc, temp, src[0][3]);
1525			emit_flop(pc, 5, dst[3], temp);
1526			if (!(mask &= 7))
1527				break;
1528			if (temp == dst[3])
1529				temp = brdc = temp_temp(pc);
1530		}
1531		emit_precossin(pc, temp, src[0][0]);
1532		emit_flop(pc, 5, brdc, temp);
1533		break;
1534	case TGSI_OPCODE_DP3:
1535		emit_mul(pc, temp, src[0][0], src[1][0]);
1536		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1537		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
1538		break;
1539	case TGSI_OPCODE_DP4:
1540		emit_mul(pc, temp, src[0][0], src[1][0]);
1541		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1542		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1543		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
1544		break;
1545	case TGSI_OPCODE_DPH:
1546		emit_mul(pc, temp, src[0][0], src[1][0]);
1547		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1548		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1549		emit_add(pc, brdc, src[1][3], temp);
1550		break;
1551	case TGSI_OPCODE_DST:
1552		if (mask & (1 << 1))
1553			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1554		if (mask & (1 << 2))
1555			emit_mov(pc, dst[2], src[0][2]);
1556		if (mask & (1 << 3))
1557			emit_mov(pc, dst[3], src[1][3]);
1558		if (mask & (1 << 0))
1559			emit_mov_immdval(pc, dst[0], 1.0f);
1560		break;
1561	case TGSI_OPCODE_EX2:
1562		emit_preex2(pc, temp, src[0][0]);
1563		emit_flop(pc, 6, brdc, temp);
1564		break;
1565	case TGSI_OPCODE_FLR:
1566		for (c = 0; c < 4; c++) {
1567			if (!(mask & (1 << c)))
1568				continue;
1569			emit_flr(pc, dst[c], src[0][c]);
1570		}
1571		break;
1572	case TGSI_OPCODE_FRC:
1573		temp = temp_temp(pc);
1574		for (c = 0; c < 4; c++) {
1575			if (!(mask & (1 << c)))
1576				continue;
1577			emit_flr(pc, temp, src[0][c]);
1578			emit_sub(pc, dst[c], src[0][c], temp);
1579		}
1580		break;
1581	case TGSI_OPCODE_KIL:
1582		emit_kil(pc, src[0][0]);
1583		emit_kil(pc, src[0][1]);
1584		emit_kil(pc, src[0][2]);
1585		emit_kil(pc, src[0][3]);
1586		break;
1587	case TGSI_OPCODE_LIT:
1588		emit_lit(pc, &dst[0], mask, &src[0][0]);
1589		break;
1590	case TGSI_OPCODE_LG2:
1591		emit_flop(pc, 3, brdc, src[0][0]);
1592		break;
1593	case TGSI_OPCODE_LRP:
1594		temp = temp_temp(pc);
1595		for (c = 0; c < 4; c++) {
1596			if (!(mask & (1 << c)))
1597				continue;
1598			emit_sub(pc, temp, src[1][c], src[2][c]);
1599			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1600		}
1601		break;
1602	case TGSI_OPCODE_MAD:
1603		for (c = 0; c < 4; c++) {
1604			if (!(mask & (1 << c)))
1605				continue;
1606			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1607		}
1608		break;
1609	case TGSI_OPCODE_MAX:
1610		for (c = 0; c < 4; c++) {
1611			if (!(mask & (1 << c)))
1612				continue;
1613			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1614		}
1615		break;
1616	case TGSI_OPCODE_MIN:
1617		for (c = 0; c < 4; c++) {
1618			if (!(mask & (1 << c)))
1619				continue;
1620			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1621		}
1622		break;
1623	case TGSI_OPCODE_MOV:
1624	case TGSI_OPCODE_SWZ:
1625		for (c = 0; c < 4; c++) {
1626			if (!(mask & (1 << c)))
1627				continue;
1628			emit_mov(pc, dst[c], src[0][c]);
1629		}
1630		break;
1631	case TGSI_OPCODE_MUL:
1632		for (c = 0; c < 4; c++) {
1633			if (!(mask & (1 << c)))
1634				continue;
1635			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1636		}
1637		break;
1638	case TGSI_OPCODE_POW:
1639		emit_pow(pc, brdc, src[0][0], src[1][0]);
1640		break;
1641	case TGSI_OPCODE_RCP:
1642		emit_flop(pc, 0, brdc, src[0][0]);
1643		break;
1644	case TGSI_OPCODE_RSQ:
1645		emit_flop(pc, 2, brdc, src[0][0]);
1646		break;
1647	case TGSI_OPCODE_SCS:
1648		temp = temp_temp(pc);
1649		if (mask & 3)
1650			emit_precossin(pc, temp, src[0][0]);
1651		if (mask & (1 << 0))
1652			emit_flop(pc, 5, dst[0], temp);
1653		if (mask & (1 << 1))
1654			emit_flop(pc, 4, dst[1], temp);
1655		if (mask & (1 << 2))
1656			emit_mov_immdval(pc, dst[2], 0.0);
1657		if (mask & (1 << 3))
1658			emit_mov_immdval(pc, dst[3], 1.0);
1659		break;
1660	case TGSI_OPCODE_SIN:
1661		if (mask & 8) {
1662			emit_precossin(pc, temp, src[0][3]);
1663			emit_flop(pc, 4, dst[3], temp);
1664			if (!(mask &= 7))
1665				break;
1666			if (temp == dst[3])
1667				temp = brdc = temp_temp(pc);
1668		}
1669		emit_precossin(pc, temp, src[0][0]);
1670		emit_flop(pc, 4, brdc, temp);
1671		break;
1672	case TGSI_OPCODE_SLT:
1673	case TGSI_OPCODE_SGE:
1674	case TGSI_OPCODE_SEQ:
1675	case TGSI_OPCODE_SGT:
1676	case TGSI_OPCODE_SLE:
1677	case TGSI_OPCODE_SNE:
1678		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
1679		for (c = 0; c < 4; c++) {
1680			if (!(mask & (1 << c)))
1681				continue;
1682			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
1683		}
1684		break;
1685	case TGSI_OPCODE_SUB:
1686		for (c = 0; c < 4; c++) {
1687			if (!(mask & (1 << c)))
1688				continue;
1689			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1690		}
1691		break;
1692	case TGSI_OPCODE_TEX:
1693		emit_tex(pc, dst, mask, src[0], unit,
1694			 inst->InstructionExtTexture.Texture, FALSE);
1695		break;
1696	case TGSI_OPCODE_TXP:
1697		emit_tex(pc, dst, mask, src[0], unit,
1698			 inst->InstructionExtTexture.Texture, TRUE);
1699		break;
1700	case TGSI_OPCODE_TRUNC:
1701		for (c = 0; c < 4; c++) {
1702			if (!(mask & (1 << c)))
1703				continue;
1704			emit_cvt(pc, dst[c], src[0][c], -1,
1705				 CVTOP_TRUNC, CVT_F32_F32);
1706		}
1707		break;
1708	case TGSI_OPCODE_XPD:
1709		temp = temp_temp(pc);
1710		if (mask & (1 << 0)) {
1711			emit_mul(pc, temp, src[0][2], src[1][1]);
1712			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1713		}
1714		if (mask & (1 << 1)) {
1715			emit_mul(pc, temp, src[0][0], src[1][2]);
1716			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1717		}
1718		if (mask & (1 << 2)) {
1719			emit_mul(pc, temp, src[0][1], src[1][0]);
1720			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1721		}
1722		if (mask & (1 << 3))
1723			emit_mov_immdval(pc, dst[3], 1.0);
1724		break;
1725	case TGSI_OPCODE_END:
1726		break;
1727	default:
1728		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1729		return FALSE;
1730	}
1731
1732	if (brdc) {
1733		if (sat)
1734			emit_sat(pc, brdc, brdc);
1735		for (c = 0; c < 4; c++)
1736			if ((mask & (1 << c)) && dst[c] != brdc)
1737				emit_mov(pc, dst[c], brdc);
1738	} else
1739	if (sat) {
1740		for (c = 0; c < 4; c++) {
1741			if (!(mask & (1 << c)))
1742				continue;
1743			/* in this case we saturate later */
1744			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
1745				continue;
1746			emit_sat(pc, rdst[c], dst[c]);
1747		}
1748	}
1749
1750	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1751		for (c = 0; c < 4; c++) {
1752			if (!src[i][c])
1753				continue;
1754			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1755				FREE(src[i][c]);
1756		}
1757	}
1758
1759	kill_temp_temp(pc);
1760	return TRUE;
1761}
1762
1763static void
1764prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
1765{
1766	struct nv50_reg *reg = NULL;
1767	const struct tgsi_full_src_register *src;
1768	const struct tgsi_dst_register *dst;
1769	unsigned i, c, k, mask;
1770
1771	dst = &insn->FullDstRegisters[0].DstRegister;
1772	mask = dst->WriteMask;
1773
1774        if (dst->File == TGSI_FILE_TEMPORARY)
1775                reg = pc->temp;
1776        else
1777        if (dst->File == TGSI_FILE_OUTPUT)
1778                reg = pc->result;
1779
1780	if (reg) {
1781		for (c = 0; c < 4; c++) {
1782			if (!(mask & (1 << c)))
1783				continue;
1784			reg[dst->Index * 4 + c].acc = pc->insn_nr;
1785		}
1786	}
1787
1788	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1789		src = &insn->FullSrcRegisters[i];
1790
1791		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
1792			reg = pc->temp;
1793		else
1794		if (src->SrcRegister.File == TGSI_FILE_INPUT)
1795			reg = pc->attr;
1796		else
1797			continue;
1798
1799		mask = nv50_tgsi_src_mask(insn, i);
1800
1801		for (c = 0; c < 4; c++) {
1802			if (!(mask & (1 << c)))
1803				continue;
1804			k = tgsi_util_get_full_src_register_extswizzle(src, c);
1805
1806			if (k > TGSI_EXTSWIZZLE_W)
1807				continue;
1808
1809			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
1810		}
1811	}
1812}
1813
/* Returns a bitmask indicating which dst components need to be
 * written to temporaries first to avoid 'corrupting' sources.
 *
 * m[i]   (out) indicate component to write in the i-th position
 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
 *
 * Fix: 'unsafe' was read (|=) without ever being initialized —
 * undefined behavior that could return garbage dependency bits.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	unsigned i, c, x, unsafe = 0;

	/* start with the identity write order */
	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
1858
1859/* Select a suitable dst register for broadcasting scalar results,
1860 * or return NULL if we have to allocate an extra TEMP.
1861 *
1862 * If e.g. only 1 component is written, we may also emit the final
1863 * result to a write-only register.
1864 */
1865static struct nv50_reg *
1866tgsi_broadcast_dst(struct nv50_pc *pc,
1867		   const struct tgsi_full_dst_register *fd, unsigned mask)
1868{
1869	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
1870		int c = ffs(~mask & fd->DstRegister.WriteMask);
1871		if (c)
1872			return tgsi_dst(pc, c - 1, fd);
1873	} else {
1874		int c = ffs(fd->DstRegister.WriteMask) - 1;
1875		if ((1 << c) == fd->DstRegister.WriteMask)
1876			return tgsi_dst(pc, c, fd);
1877	}
1878
1879	return NULL;
1880}
1881
/* Scan source swizzles and return a bitmask indicating dst regs that
 * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
 */
static unsigned
nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
		       unsigned rdep[4])
{
	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
	const struct tgsi_full_src_register *fs;
	unsigned i, deqs = 0;

	for (i = 0; i < 4; ++i)
		rdep[i] = 0;

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
		boolean neg_supp = negate_supported(insn, i);

		fs = &insn->FullSrcRegisters[i];
		/* only a src aliasing the dst register can be hazardous */
		if (fs->SrcRegister.File != fd->DstRegister.File ||
		    fs->SrcRegister.Index != fd->DstRegister.Index)
			continue;

		for (chn = 0; chn < 4; ++chn) {
			unsigned s, c;

			if (!(mask & (1 << chn))) /* src is not read */
				continue;
			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);

			/* ZERO/ONE, or a channel the dst doesn't write */
			if (c > TGSI_EXTSWIZZLE_W ||
			    !(fd->DstRegister.WriteMask & (1 << c)))
				continue;

			/* no danger if src is copied to TEMP first */
			if ((s != TGSI_UTIL_SIGN_KEEP) &&
			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
				continue;

			rdep[c] |= nv50_tgsi_dst_revdep(
				insn->Instruction.Opcode, i, chn);
			deqs |= (1 << c);
		}
	}

	return deqs;
}
1930
/* Emit one TGSI instruction, splitting it into per-component passes
 * when a dst component is also read as a source (swizzle hazard),
 * and routing scalar results through a broadcast register.
 */
static boolean
nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
{
	struct tgsi_full_instruction insn = tok->FullInstruction;
	const struct tgsi_full_dst_register *fd;
	unsigned i, deqs, rdep[4], m[4];

	fd = &tok->FullInstruction.FullDstRegisters[0];
	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);

	if (is_scalar_op(insn.Instruction.Opcode)) {
		/* scalar result: pick (or allocate) a broadcast reg */
		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
		if (!pc->r_brdc)
			pc->r_brdc = temp_temp(pc);
		return nv50_program_tx_insn(pc, &insn);
	}
	pc->r_brdc = NULL;

	if (!deqs)
		/* no swizzle hazards, emit in one go */
		return nv50_program_tx_insn(pc, &insn);

	deqs = nv50_revdep_reorder(m, rdep);

	/* emit one component at a time, in hazard-free order m[] */
	for (i = 0; i < 4; ++i) {
		assert(pc->r_dst[m[i]] == NULL);

		insn.FullDstRegisters[0].DstRegister.WriteMask =
			fd->DstRegister.WriteMask & (1 << m[i]);

		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
			continue;

		/* unresolved dependency: write this component to a temp */
		if (deqs & (1 << i))
			pc->r_dst[m[i]] = alloc_temp(pc, NULL);

		if (!nv50_program_tx_insn(pc, &insn))
			return FALSE;
	}

	/* flush redirected components back to the real destination */
	for (i = 0; i < 4; i++) {
		struct nv50_reg *reg = pc->r_dst[i];
		if (!reg)
			continue;
		pc->r_dst[i] = NULL;

		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
		else
			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
		free_temp(pc, reg);
	}

	return TRUE;
}
1985
/* Emit the interpolation instruction for one FP input register,
 * loading (and caching) the 1/w interpolation value for perspective
 * interpolation if it has not been computed yet.
 */
static void
load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
{
	struct nv50_reg *iv, **ppiv;
	unsigned mode = pc->interp_mode[reg->index];

	/* centroid and center interpolation cache 1/w separately */
	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
	iv = *ppiv;

	if ((mode & INTERP_PERSPECTIVE) && !iv) {
		iv = *ppiv = alloc_temp(pc, NULL);
		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;

		/* interpolate w, then take its reciprocal (flop 0 = RCP) */
		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
		emit_flop(pc, 0, iv, iv);

		/* XXX: when loading interpolants dynamically, move these
		 * to the program head, or make sure it can't be skipped.
		 */
	}

	emit_interp(pc, reg, iv, mode);
}
2009
/* First pass over the TGSI tokens:
 * - copy immediates into the translation context
 * - record interpolation modes for FP inputs
 * - count instructions and register accesses (prep_inspect_insn)
 * - assign hardware register ids to VP attrs/results and FP
 *   inputs/results, and fill the related p->cfg state
 */
static boolean
nv50_program_tx_prep(struct nv50_pc *pc)
{
	struct tgsi_parse_context tp;
	struct nv50_program *p = pc->p;
	boolean ret = FALSE;
	unsigned i, c, flat_nr = 0;

	tgsi_parse_init(&tp, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&tp)) {
		const union tgsi_full_token *tok = &tp.FullToken;

		tgsi_parse_token(&tp);
		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm =
				&tp.FullToken.FullImmediate;

			ctor_immd(pc, imm->u[0].Float,
				      imm->u[1].Float,
				      imm->u[2].Float,
				      imm->u[3].Float);
		}
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *d;
			unsigned si, last, first, mode;

			d = &tp.FullToken.FullDeclaration;
			first = d->DeclarationRange.First;
			last = d->DeclarationRange.Last;

			switch (d->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				break;
			case TGSI_FILE_OUTPUT:
				if (!d->Declaration.Semantic ||
				    p->type == PIPE_SHADER_FRAGMENT)
					break;

				/* NOTE(review): si is unused until more
				 * output semantics are handled here */
				si = d->Semantic.SemanticIndex;
				switch (d->Semantic.SemanticName) {
					/*
				case TGSI_SEMANTIC_CLIP_DISTANCE:
					p->cfg.clpd = MIN2(p->cfg.clpd, first);
					break;
					*/
				default:
					break;
				}
				break;
			case TGSI_FILE_INPUT:
			{
				if (p->type != PIPE_SHADER_FRAGMENT)
					break;

				switch (d->Declaration.Interpolate) {
				case TGSI_INTERPOLATE_CONSTANT:
					mode = INTERP_FLAT;
					flat_nr++;
					break;
				case TGSI_INTERPOLATE_PERSPECTIVE:
					mode = INTERP_PERSPECTIVE;
					p->cfg.regs[1] |= 0x08 << 24;
					break;
				default:
					mode = INTERP_LINEAR;
					break;
				}
				if (d->Declaration.Centroid)
					mode |= INTERP_CENTROID;

				assert(last < 32);
				for (i = first; i <= last; i++)
					pc->interp_mode[i] = mode;
			}
				break;
			case TGSI_FILE_CONSTANT:
				break;
			case TGSI_FILE_SAMPLER:
				break;
			default:
				NOUVEAU_ERR("bad decl file %d\n",
					    d->Declaration.File);
				goto out_err;
			}
		}
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			pc->insn_nr++;
			prep_inspect_insn(pc, &tok->FullInstruction);
			break;
		default:
			break;
		}
	}

	if (p->type == PIPE_SHADER_VERTEX) {
		int rid = 0;

		/* assign contiguous hw slots to the attrs actually used */
		for (i = 0; i < pc->attr_nr * 4; ++i) {
			if (pc->attr[i].acc) {
				pc->attr[i].hw = rid++;
				p->cfg.attr[i / 32] |= 1 << (i % 32);
			}
		}

		/* same for results, recording per-output component masks */
		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
			p->cfg.io[i].hw = rid;
			p->cfg.io[i].id_vp = i;

			for (c = 0; c < 4; ++c) {
				int n = i * 4 + c;
				if (!pc->result[n].acc)
					continue;
				pc->result[n].hw = rid++;
				p->cfg.io[i].mask |= 1 << c;
			}
		}
	} else
	if (p->type == PIPE_SHADER_FRAGMENT) {
		int rid, aid;
		unsigned n = 0, m = pc->attr_nr - flat_nr;

		/* base == 0 iff input 0 is the fragment coordinate */
		int base = (TGSI_SEMANTIC_POSITION ==
			    p->info.input_semantic_name[0]) ? 0 : 1;

		/* non-flat interpolants have to be mapped to
		 * the lower hardware IDs, so sort them:
		 */
		for (i = 0; i < pc->attr_nr; i++) {
			if (pc->interp_mode[i] == INTERP_FLAT) {
				p->cfg.io[m].id_vp = i + base;
				p->cfg.io[m++].id_fp = i;
			} else {
				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
					p->cfg.io[n].linear = TRUE;
				p->cfg.io[n].id_vp = i + base;
				p->cfg.io[n++].id_fp = i;
			}
		}

		if (!base) /* set w-coordinate mask from perspective interp */
			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;

		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);

		/* assign hw interpolant slots and emit the loads */
		for (n = 0; n < pc->attr_nr; ++n) {
			p->cfg.io[n].hw = rid = aid;
			i = p->cfg.io[n].id_fp;

			for (c = 0; c < 4; ++c) {
				if (!pc->attr[i * 4 + c].acc)
					continue;
				pc->attr[i * 4 + c].rhw = rid++;
				p->cfg.io[n].mask |= 1 << c;

				load_interpolant(pc, &pc->attr[i * 4 + c]);
			}
			aid += popcnt4(p->cfg.io[n].mask);
		}

		if (!base)
			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;

		m = popcnt4(p->cfg.regs[1] >> 24);

		/* set count of non-position inputs and of non-flat
		 * non-position inputs for FP_INTERPOLANT_CTRL
		 */
		p->cfg.regs[1] |= aid - m;

		if (flat_nr) {
			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
			p->cfg.regs[1] |= (i - m) << 16;
		} else
			p->cfg.regs[1] |= p->cfg.regs[1] << 16;

		/* mark color semantic for light-twoside */
		n = 0x40;
		for (i = 0; i < pc->attr_nr; i++) {
			ubyte si, sn;

			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];

			if (sn == TGSI_SEMANTIC_COLOR) {
				p->cfg.two_side[si] = p->cfg.io[i];

				/* increase colour count */
				p->cfg.regs[0] += popcnt4(
					p->cfg.two_side[si].mask) << 16;

				n = MIN2(n, p->cfg.io[i].hw - m);
			}
		}
		if (n < 0x40)
			p->cfg.regs[0] += n;

		/* Initialize FP results:
		 * FragDepth is always first TGSI and last hw output
		 */
		i = p->info.writes_z ? 4 : 0;
		for (rid = 0; i < pc->result_nr * 4; i++)
			pc->result[i].rhw = rid++;
		if (p->info.writes_z)
			pc->result[2].rhw = rid;
	}

	/* build reg records for the immediates gathered above */
	if (pc->immd_nr) {
		int rid = 0;

		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->immd)
			goto out_err;

		for (i = 0; i < pc->immd_nr; i++) {
			for (c = 0; c < 4; c++, rid++)
				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
		}
	}

	ret = TRUE;
out_err:
	if (pc->iv_p)
		free_temp(pc, pc->iv_p);
	if (pc->iv_c)
		free_temp(pc, pc->iv_c);

	tgsi_parse_free(&tp);
	return ret;
}
2245
2246static void
2247free_nv50_pc(struct nv50_pc *pc)
2248{
2249	if (pc->immd)
2250		FREE(pc->immd);
2251	if (pc->param)
2252		FREE(pc->param);
2253	if (pc->result)
2254		FREE(pc->result);
2255	if (pc->attr)
2256		FREE(pc->attr);
2257	if (pc->temp)
2258		FREE(pc->temp);
2259
2260	FREE(pc);
2261}
2262
/* Initialize the translation context for program p: read register
 * counts from the TGSI scan info and allocate the per-file reg arrays.
 * Returns FALSE on allocation failure (caller frees via free_nv50_pc).
 */
static boolean
ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
{
	int i, c;
	/* reg types for inputs/outputs; FPs use TEMPs for both */
	unsigned rtype[2] = { P_ATTR, P_RESULT };

	pc->p = p;
	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;

	p->cfg.high_temp = 4;

	/* 0x40 == unused/invalid hw slot sentinel */
	p->cfg.two_side[0].hw = 0x40;
	p->cfg.two_side[1].hw = 0x40;

	switch (p->type) {
	case PIPE_SHADER_VERTEX:
		p->cfg.clpd = 0x40;
		p->cfg.io_nr = pc->result_nr;
		break;
	case PIPE_SHADER_FRAGMENT:
		rtype[0] = rtype[1] = P_TEMP;

		p->cfg.regs[0] = 0x01000004;
		p->cfg.io_nr = pc->attr_nr;

		if (p->info.writes_z) {
			p->cfg.regs[2] |= 0x00000100;
			p->cfg.regs[3] |= 0x00000011;
		}
		if (p->info.uses_kill)
			p->cfg.regs[2] |= 0x00100000;
		break;
	}

	if (pc->temp_nr) {
		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->temp)
			return FALSE;

		for (i = 0; i < pc->temp_nr * 4; ++i)
			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
	}

	if (pc->attr_nr) {
		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->attr)
			return FALSE;

		for (i = 0; i < pc->attr_nr * 4; ++i)
			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
	}

	if (pc->result_nr) {
		unsigned nr = pc->result_nr * 4;

		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
		if (!pc->result)
			return FALSE;

		for (i = 0; i < nr; ++i)
			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
	}

	if (pc->param_nr) {
		int rid = 0;

		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->param)
			return FALSE;

		for (i = 0; i < pc->param_nr; ++i)
			for (c = 0; c < 4; ++c, ++rid)
				ctor_reg(&pc->param[rid], P_CONST, i, rid);
	}

	return TRUE;
}
2343
/* Translate the whole TGSI program: prep pass, per-instruction
 * emission, FP result relocation, and pairing of half instructions
 * (two short insns share one 64-bit slot; unpaired ones must be
 * converted to the long encoding).
 */
static boolean
nv50_program_tx(struct nv50_program *p)
{
	struct tgsi_parse_context parse;
	struct nv50_pc *pc;
	unsigned k;
	boolean ret;

	pc = CALLOC_STRUCT(nv50_pc);
	if (!pc)
		return FALSE;

	ret = ctor_nv50_pc(pc, p);
	if (ret == FALSE)
		goto out_cleanup;

	ret = nv50_program_tx_prep(pc);
	if (ret == FALSE)
		goto out_cleanup;

	tgsi_parse_init(&parse, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&parse)) {
		const union tgsi_full_token *tok = &parse.FullToken;

		/* don't allow half insn/immd on first and last instruction */
		pc->allow32 = TRUE;
		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
			pc->allow32 = FALSE;

		tgsi_parse_token(&parse);

		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			++pc->insn_cur;
			ret = nv50_tgsi_insn(pc, tok);
			if (ret == FALSE)
				goto out_err;
			break;
		default:
			break;
		}
	}

	/* move FP results into the hw output registers they belong in */
	if (p->type == PIPE_SHADER_FRAGMENT) {
		struct nv50_reg out;
		ctor_reg(&out, P_TEMP, -1, -1);

		for (k = 0; k < pc->result_nr * 4; k++) {
			if (pc->result[k].rhw == -1)
				continue;
			if (pc->result[k].hw != pc->result[k].rhw) {
				out.hw = pc->result[k].rhw;
				emit_mov(pc, &out, &pc->result[k]);
			}
			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
				pc->p->cfg.high_result = pc->result[k].rhw + 1;
		}
	}

	/* look for single half instructions and make them long */
	struct nv50_program_exec *e, *e_prev;

	for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
		if (!is_long(e))
			k++;

		/* at the end of a run of short insns: an odd count
		 * leaves one unpaired, lengthen the last of the run */
		if (!e->next || is_long(e->next)) {
			if (k & 1)
				convert_to_long(pc, e);
			k = 0;
		}

		if (e->next)
			e_prev = e;
	}

	if (!is_long(pc->p->exec_tail)) {
		/* this may occur if moving FP results */
		assert(e_prev && !is_long(e_prev));
		convert_to_long(pc, e_prev);
		convert_to_long(pc, pc->p->exec_tail);
	}

	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
	/* mark the program's last instruction (exit bit) */
	pc->p->exec_tail->inst[1] |= 0x00000001;

	p->param_nr = pc->param_nr * 4;
	p->immd_nr = pc->immd_nr * 4;
	p->immd = pc->immd_buf;

out_err:
	tgsi_parse_free(&parse);

out_cleanup:
	free_nv50_pc(pc);
	return ret;
}
2441
2442static void
2443nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2444{
2445	if (nv50_program_tx(p) == FALSE)
2446		assert(0);
2447	p->translated = TRUE;
2448}
2449
2450static void
2451nv50_program_upload_data(struct nv50_context *nv50, float *map,
2452			unsigned start, unsigned count, unsigned cbuf)
2453{
2454	struct nouveau_channel *chan = nv50->screen->base.channel;
2455	struct nouveau_grobj *tesla = nv50->screen->tesla;
2456
2457	while (count) {
2458		unsigned nr = count > 2047 ? 2047 : count;
2459
2460		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2461		OUT_RING  (chan, (cbuf << 0) | (start << 8));
2462		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2463		OUT_RINGp (chan, map, nr);
2464
2465		map += nr;
2466		start += nr;
2467		count -= nr;
2468	}
2469}
2470
/* Ensure the program's immediates (data[0]) and parameters (data[1]) have
 * space in the constant-buffer heaps and are uploaded.  On heap exhaustion,
 * other programs' allocations are evicted until the request fits.
 * Parameter values are re-copied from the bound pipe constant buffer on
 * every call; immediates are re-uploaded only after (re)allocation.
 */
static void
nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
{
	struct pipe_screen *pscreen = nv50->pipe.screen;

	if (!p->data[0] && p->immd_nr) {
		struct nouveau_resource *heap = nv50->screen->immd_heap[0];

		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
			/* Evict other programs' immediates until enough
			 * space is free; second alloc must then succeed. */
			while (heap->next && heap->size < p->immd_nr) {
				struct nv50_program *evict = heap->next->priv;
				nouveau_resource_free(&evict->data[0]);
			}

			if (nouveau_resource_alloc(heap, p->immd_nr, p,
						   &p->data[0]))
				assert(0);
		}

		/* immediates only need to be uploaded again when freed */
		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
					 p->immd_nr, NV50_CB_PMISC);
	}

	if (!p->data[1] && p->param_nr) {
		struct nouveau_resource *heap =
			nv50->screen->parm_heap[p->type];

		if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
			/* Same eviction strategy as for immediates above. */
			while (heap->next && heap->size < p->param_nr) {
				struct nv50_program *evict = heap->next->priv;
				nouveau_resource_free(&evict->data[1]);
			}

			if (nouveau_resource_alloc(heap, p->param_nr, p,
						   &p->data[1]))
				assert(0);
		}
	}

	if (p->param_nr) {
		unsigned cbuf = NV50_CB_PVP;
		/* Copy the current user constants from the pipe constant
		 * buffer into the hardware constant buffer. */
		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
					     PIPE_BUFFER_USAGE_CPU_READ);
		if (p->type == PIPE_SHADER_FRAGMENT)
			cbuf = NV50_CB_PFP;
		nv50_program_upload_data(nv50, map, p->data[1]->start,
					 p->param_nr, cbuf);
		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
	}
}
2522
/* Upload the program's machine code to its VRAM buffer, allocating the
 * buffer on first use.  If the heap offsets of the constant data moved
 * since the last upload, patch the constant-buffer indices embedded in
 * the instruction words before uploading.  The code is pushed through
 * the CB upload path (NV50_CB_PUPLOAD) in pushbuf-sized chunks.
 */
static void
nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
{
	struct nouveau_channel *chan = nv50->screen->base.channel;
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program_exec *e;
	struct nouveau_stateobj *so;
	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
	unsigned start, count, *up, *ptr;
	boolean upload = FALSE;

	if (!p->bo) {
		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
			       p->exec_size * 4, &p->bo);
		upload = TRUE;
	}

	/* Re-patch constant indices if either data block moved in its heap. */
	if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
		(p->data[1] && p->data[1]->start != p->data_start[1])) {
		for (e = p->exec_head; e; e = e->next) {
			unsigned ei, ci, bs;

			/* param.index < 0 marks instructions without a
			 * constant-buffer reference to relocate */
			if (e->param.index < 0)
				continue;
			bs = (e->inst[1] >> 22) & 0x07;
			assert(bs < 2);
			ei = e->param.shift >> 5;
			ci = e->param.index + p->data[bs]->start;

			e->inst[ei] &= ~e->param.mask;
			e->inst[ei] |= (ci << e->param.shift);
		}

		if (p->data[0])
			p->data_start[0] = p->data[0]->start;
		if (p->data[1])
			p->data_start[1] = p->data[1]->start;

		upload = TRUE;
	}

	if (!upload)
		return;

#ifdef NV50_PROGRAM_DUMP
	NOUVEAU_ERR("-------\n");
	for (e = p->exec_head; e; e = e->next) {
		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
		if (is_long(e))
			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
	}
#endif

	/* Flatten the exec list into a contiguous dword array for upload;
	 * short instructions emit one dword, long ones two. */
	up = ptr = MALLOC(p->exec_size * 4);
	for (e = p->exec_head; e; e = e->next) {
		*(ptr++) = e->inst[0];
		if (is_long(e))
			*(ptr++) = e->inst[1];
	}

	/* Point the upload constant buffer at the program's VRAM buffer. */
	so = so_new(4,2);
	so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));

	start = 0; count = p->exec_size;
	while (count) {
		struct nouveau_channel *chan = nv50->screen->base.channel;
		unsigned nr;

		so_emit(chan, so);

		/* Cap at the CB_DATA method limit and at the remaining
		 * pushbuf space; if too little is left even for the chunk
		 * plus method headers, flush and retry. */
		nr = MIN2(count, 2047);
		nr = MIN2(chan->pushbuf->remaining, nr);
		if (chan->pushbuf->remaining < (nr + 3)) {
			FIRE_RING(chan);
			continue;
		}

		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
		OUT_RINGp (chan, up + start, nr);

		start += nr;
		count -= nr;
	}

	FREE(up);
	so_ref(NULL, &so);
}
2615
/* Validate the current vertex program: translate if needed, upload its
 * constants and code, and build the stateobj that binds the VP address
 * and register/attribute configuration.  The method order in the
 * stateobj is preserved as emitted.
 */
void
nv50_vertprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->vertprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(13, 2);
	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
	so_data  (so, p->cfg.attr[0]);
	so_data  (so, p->cfg.attr[1]);
	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
	so_data  (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
	so_data  (so, p->cfg.high_result); //8);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.vertprog);
	so_ref(NULL, &so);
}
2651
/* Validate the current fragment program: translate if needed, upload its
 * constants and code, and build the stateobj that binds the FP address,
 * register allocation and control registers.
 */
void
nv50_fragprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->fragprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(64, 2);
	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
	so_data  (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
	so_data  (so, p->cfg.regs[2]);
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
	so_data  (so, p->cfg.regs[3]);
	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.fragprog);
	so_ref(NULL, &so);
}
2687
2688static int
2689nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
2690	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
2691{
2692	int c;
2693	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
2694	uint8_t *map = (uint8_t *)p_map;
2695
2696	for (c = 0; c < 4; ++c) {
2697		if (mf & 1) {
2698			if (fpi->linear == TRUE)
2699				lin[mid / 32] |= 1 << (mid % 32);
2700			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
2701		}
2702
2703		oid += mv & 1;
2704		mf >>= 1;
2705		mv >>= 1;
2706	}
2707
2708	return mid;
2709}
2710
/* Build the VP-output to FP-input linkage state: the VP result map that
 * routes VP output components to FP input slots, the semantic map
 * registers, the interpolant control word, and the linear-interpolation
 * bit masks.
 */
void
nv50_linkage_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *vp = nv50->vertprog;
	struct nv50_program *fp = nv50->fragprog;
	struct nouveau_stateobj *so;
	struct nv50_sreg4 dummy, *vpo;
	int i, n, c, m = 0;
	uint32_t map[16], lin[4], reg[5];

	memset(map, 0, sizeof(map));
	memset(lin, 0, sizeof(lin));

	reg[1] = 0x00000004; /* low and high clip distance map ids */
	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
	reg[3] = 0x00000000; /* point size map id & enable */
	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
	reg[4] = fp->cfg.regs[1]; /* interpolant info */

	/* HPOS always goes first and maps all four components. */
	dummy.linear = FALSE;
	dummy.mask = 0xf; /* map all components of HPOS */
	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);

	/* From here on, dummy stands in for VP outputs that don't exist. */
	dummy.mask = 0x0;

	/* Clip distances, if the VP writes any (clpd < 0x40). */
	if (vp->cfg.clpd < 0x40) {
		for (c = 0; c < vp->cfg.clpd_nr; ++c)
			map[m++] = vp->cfg.clpd + c;
		reg[1] = (m << 8);
	}

	reg[0] |= m << 8; /* adjust BFC0 id */
	reg[0] += m - 4; /* adjust FFC0 id */
	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */

	/* Map each FP input to the matching VP output; skip the FP's
	 * position input (slot 0) since HPOS was handled above. */
	i = 0;
	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
		i = 1;
	for (; i < fp->cfg.io_nr; i++) {
		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];

		/* Use the dummy (unwritten) source if the VP has no output
		 * with the same semantic name and index. */
		n = fp->cfg.io[i].id_vp;
		if (n >= vp->cfg.io_nr ||
		    vp->info.output_semantic_name[n] != sn ||
		    vp->info.output_semantic_index[n] != si)
			vpo = &dummy;
		else
			vpo = &vp->cfg.io[n];

		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
	}

	/* now fill the stateobj */
	so = so_new(64, 0);

	n = (m + 3) / 4;
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
	so_data  (so, m);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
	so_datap (so, map, n);

	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
	so_datap (so, reg, 4);

	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
	so_data  (so, reg[4]);

	/* 0x1540: linear interpolation enable bits, one per map entry. */
	so_method(so, tesla, 0x1540, 4);
	so_datap (so, lin, 4);

        so_ref(so, &nv50->state.programs);
        so_ref(NULL, &so);
}
2786
2787void
2788nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2789{
2790	while (p->exec_head) {
2791		struct nv50_program_exec *e = p->exec_head;
2792
2793		p->exec_head = e->next;
2794		FREE(e);
2795	}
2796	p->exec_tail = NULL;
2797	p->exec_size = 0;
2798
2799	nouveau_bo_ref(NULL, &p->bo);
2800
2801	nouveau_resource_free(&p->data[0]);
2802	nouveau_resource_free(&p->data[1]);
2803
2804	p->translated = 0;
2805}
2806