nv50_program.c revision 4d7b4781c82c60d646ee5e766824a0f894e4c292
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
34#define NV50_SU_MAX_TEMP 64
35//#define NV50_PROGRAM_DUMP
36
37/* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
 * 	- Drop it for now; instead introduce a way to negate args for
 * 	  ops that support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
 * IMPORTANT: watch dst==src vectors, we can overwrite components that
 * 	are still needed, ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * 	FP attr/result assignment - how?
58 * 		attrib
59 * 			- 0x16bc maps vp output onto fp hpos
60 * 			- 0x16c0 maps vp output onto fp col0
61 * 		result
62 * 			- colr always 0-3
63 * 			- depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * 	      "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * 	- 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * 	- XX == FP high something
75 */
/* One scalar operand as tracked by the translator; "hw" is the assigned
 * hardware slot within the operand's file (-1 while unallocated).
 */
struct nv50_reg {
	enum {
		P_TEMP,		/* hardware GPR */
		P_ATTR,		/* shader input */
		P_RESULT,	/* shader output */
		P_CONST,	/* constant buffer element */
		P_IMMD		/* immediate, stored in pc->immd_buf */
	} type;
	int index;	/* TGSI register index, or -1 for anonymous temps */

	int hw;		/* hardware slot, -1 = not yet allocated */
	int neg;	/* negate modifier requested for this operand */

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
92
/* Per-shader translation context: hardware register bookkeeping plus
 * the nv50_reg arrays backing each TGSI register file.
 */
struct nv50_pc {
	struct nv50_program *p;

	/* hw resources */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];	/* GPR occupant or NULL */

	/* tgsi resources */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	float *immd_buf;	/* packed vec4 immediates: immd_nr * 4 floats */
	int immd_nr;

	/* scratch temps, released per-instruction by kill_temp_temp() */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;

	/* broadcast and destination replacement regs */
	struct nv50_reg *r_brdc;
	struct nv50_reg *r_dst[4];

	unsigned interp_mode[32];
	/* perspective interpolation registers */
	struct nv50_reg *iv_p;
	struct nv50_reg *iv_c;

	/* current instruction and total number of insns */
	unsigned insn_cur;
	unsigned insn_nr;

	boolean allow32;	/* may emit short (32-bit) encodings */
};
130
131static INLINE void
132ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
133{
134	reg->type = type;
135	reg->index = index;
136	reg->hw = hw;
137	reg->neg = 0;
138	reg->rhw = -1;
139	reg->acc = 0;
140}
141
142static INLINE unsigned
143popcnt4(uint32_t val)
144{
145	static const unsigned cnt[16]
146	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
147	return cnt[val & 0xf];
148}
149
/* Bind reg to a hardware resource and maintain the program's
 * high-water marks.  Temporaries without a hw index get a free GPR,
 * preferring reg->rhw (so FP results land in their required slot);
 * if that slot is taken, allocation starts past the result range to
 * avoid circular displacement.
 */
static void
alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
{
	int i = 0;

	if (reg->type == P_RESULT) {
		if (pc->p->cfg.high_result < (reg->hw + 1))
			pc->p->cfg.high_result = reg->hw + 1;
	}

	/* only temporaries need an allocation below */
	if (reg->type != P_TEMP)
		return;

	if (reg->hw >= 0) {
		/*XXX: do this here too to catch FP temp-as-attr usage..
		 *     not clean, but works */
		if (pc->p->cfg.high_temp < (reg->hw + 1))
			pc->p->cfg.high_temp = reg->hw + 1;
		return;
	}

	if (reg->rhw != -1) {
		/* try to allocate temporary with index rhw first */
		if (!(pc->r_temp[reg->rhw])) {
			pc->r_temp[reg->rhw] = reg;
			reg->hw = reg->rhw;
			if (pc->p->cfg.high_temp < (reg->rhw + 1))
				pc->p->cfg.high_temp = reg->rhw + 1;
			return;
		}
		/* make sure we don't get things like $r0 needs to go
		 * in $r1 and $r1 in $r0
		 */
		i = pc->result_nr * 4;
	}

	/* first free GPR at or above i */
	for (; i < NV50_SU_MAX_TEMP; i++) {
		if (!(pc->r_temp[i])) {
			pc->r_temp[i] = reg;
			reg->hw = i;
			if (pc->p->cfg.high_temp < (i + 1))
				pc->p->cfg.high_temp = i + 1;
			return;
		}
	}

	/* register file exhausted */
	assert(0);
}
198
199static struct nv50_reg *
200alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
201{
202	struct nv50_reg *r;
203	int i;
204
205	if (dst && dst->type == P_TEMP && dst->hw == -1)
206		return dst;
207
208	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
209		if (!pc->r_temp[i]) {
210			r = MALLOC_STRUCT(nv50_reg);
211			ctor_reg(r, P_TEMP, -1, i);
212			pc->r_temp[i] = r;
213			return r;
214		}
215	}
216
217	assert(0);
218	return NULL;
219}
220
221/* Assign the hw of the discarded temporary register src
222 * to the tgsi register dst and free src.
223 */
224static void
225assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
226{
227	assert(src->index == -1 && src->hw != -1);
228
229	if (dst->hw != -1)
230		pc->r_temp[dst->hw] = NULL;
231	pc->r_temp[src->hw] = dst;
232	dst->hw = src->hw;
233
234	FREE(src);
235}
236
237/* release the hardware resource held by r */
238static void
239release_hw(struct nv50_pc *pc, struct nv50_reg *r)
240{
241	assert(r->type == P_TEMP);
242	if (r->hw == -1)
243		return;
244
245	assert(pc->r_temp[r->hw] == r);
246	pc->r_temp[r->hw] = NULL;
247
248	r->acc = 0;
249	if (r->index == -1)
250		FREE(r);
251}
252
253static void
254free_temp(struct nv50_pc *pc, struct nv50_reg *r)
255{
256	if (r->index == -1) {
257		unsigned hw = r->hw;
258
259		FREE(pc->r_temp[hw]);
260		pc->r_temp[hw] = NULL;
261	}
262}
263
264static int
265alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
266{
267	int i;
268
269	if ((idx + 4) >= NV50_SU_MAX_TEMP)
270		return 1;
271
272	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
273	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
274		return alloc_temp4(pc, dst, idx + 4);
275
276	for (i = 0; i < 4; i++) {
277		dst[i] = MALLOC_STRUCT(nv50_reg);
278		ctor_reg(dst[i], P_TEMP, -1, idx + i);
279		pc->r_temp[idx + i] = dst[i];
280	}
281
282	return 0;
283}
284
/* Release all four temps of a quad allocated by alloc_temp4(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	unsigned c;

	for (c = 0; c < 4; c++)
		free_temp(pc, reg[c]);
}
293
294static struct nv50_reg *
295temp_temp(struct nv50_pc *pc)
296{
297	if (pc->temp_temp_nr >= 16)
298		assert(0);
299
300	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
301	return pc->temp_temp[pc->temp_temp_nr++];
302}
303
304static void
305kill_temp_temp(struct nv50_pc *pc)
306{
307	int i;
308
309	for (i = 0; i < pc->temp_temp_nr; i++)
310		free_temp(pc, pc->temp_temp[i]);
311	pc->temp_temp_nr = 0;
312}
313
314static int
315ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
316{
317	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
318			       (pc->immd_nr + 1) * 4 * sizeof(float));
319	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
320	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
321	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
322	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
323
324	return pc->immd_nr++;
325}
326
327static struct nv50_reg *
328alloc_immd(struct nv50_pc *pc, float f)
329{
330	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
331	unsigned hw;
332
333	for (hw = 0; hw < pc->immd_nr * 4; hw++)
334		if (pc->immd_buf[hw] == f)
335			break;
336
337	if (hw == pc->immd_nr * 4)
338		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
339
340	ctor_reg(r, P_IMMD, -1, hw);
341	return r;
342}
343
344static struct nv50_program_exec *
345exec(struct nv50_pc *pc)
346{
347	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
348
349	e->param.index = -1;
350	return e;
351}
352
353static void
354emit(struct nv50_pc *pc, struct nv50_program_exec *e)
355{
356	struct nv50_program *p = pc->p;
357
358	if (p->exec_tail)
359		p->exec_tail->next = e;
360	if (!p->exec_head)
361		p->exec_head = e;
362	p->exec_tail = e;
363	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
364}
365
366static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
367
368static boolean
369is_long(struct nv50_program_exec *e)
370{
371	if (e->inst[0] & 1)
372		return TRUE;
373	return FALSE;
374}
375
376static boolean
377is_immd(struct nv50_program_exec *e)
378{
379	if (is_long(e) && (e->inst[1] & 3) == 3)
380		return TRUE;
381	return FALSE;
382}
383
384static INLINE void
385set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
386	 struct nv50_program_exec *e)
387{
388	set_long(pc, e);
389	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
390	e->inst[1] |= (pred << 7) | (idx << 12);
391}
392
393static INLINE void
394set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
395	    struct nv50_program_exec *e)
396{
397	set_long(pc, e);
398	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
399	e->inst[1] |= (idx << 4) | (on << 6);
400}
401
402static INLINE void
403set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
404{
405	if (is_long(e))
406		return;
407
408	e->inst[0] |= 1;
409	set_pred(pc, 0xf, 0, e);
410	set_pred_wr(pc, 0, 0, e);
411}
412
413static INLINE void
414set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
415{
416	if (dst->type == P_RESULT) {
417		set_long(pc, e);
418		e->inst[1] |= 0x00000008;
419	}
420
421	alloc_reg(pc, dst);
422	e->inst[0] |= (dst->hw << 2);
423}
424
/* Inline the float referenced by imm directly into the instruction
 * words: low 6 bits of the IEEE value into inst[0], the rest into
 * inst[1].  The negate modifier is folded into the value itself.
 */
static INLINE void
set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
{
	float f = pc->immd_buf[imm->hw];
	unsigned val = fui(imm->neg ? -f : f);

	set_long(pc, e);
	/*XXX: can't be predicated - bits overlap.. catch cases where both
	 *     are required and avoid them. */
	set_pred(pc, 0, 0, e);
	set_pred_wr(pc, 0, 0, e);

	/* both low bits set marks the immediate form (see is_immd) */
	e->inst[1] |= 0x00000002 | 0x00000001;
	e->inst[0] |= (val & 0x3f) << 16;
	e->inst[1] |= (val >> 6) << 2;
}
441
442
443#define INTERP_LINEAR		0
444#define INTERP_FLAT			1
445#define INTERP_PERSPECTIVE	2
446#define INTERP_CENTROID		4
447
448/* interpolant index has been stored in dst->rhw */
449static void
450emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
451		unsigned mode)
452{
453	assert(dst->rhw != -1);
454	struct nv50_program_exec *e = exec(pc);
455
456	e->inst[0] |= 0x80000000;
457	set_dst(pc, dst, e);
458	e->inst[0] |= (dst->rhw << 16);
459
460	if (mode & INTERP_FLAT) {
461		e->inst[0] |= (1 << 8);
462	} else {
463		if (mode & INTERP_PERSPECTIVE) {
464			e->inst[0] |= (1 << 25);
465			alloc_reg(pc, iv);
466			e->inst[0] |= (iv->hw << 9);
467		}
468
469		if (mode & INTERP_CENTROID)
470			e->inst[0] |= (1 << 24);
471	}
472
473	emit(pc, e);
474}
475
/* Record that e reads const/immediate element src->hw; m/s describe
 * the bitfield of the encoding that holds the index (mask m shifted
 * into word position, s >= 32 referring to inst[1]).  The actual
 * buffer binding is resolved later from e->param.
 */
static void
set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
	 struct nv50_program_exec *e)
{
	set_long(pc, e);

	e->param.index = src->hw;
	e->param.shift = s;
	e->param.mask = m << (s % 32);

	/* NOTE(review): bit 22 is clear for immediates, set for
	 * constants - presumably a buffer select; confirm vs hw docs */
	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
}
488
/* Emit a MOV from src to dst, picking the cheapest legal encoding:
 * inlined immediate (if allow32 and dst is not an output), const/immd
 * via the data path, or a plain register/attribute read.
 */
static void
emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x10000000;

	set_dst(pc, dst, e);

	if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
		set_immd(pc, src, e);
		/*XXX: 32-bit, but steals part of "half" reg space - need to
		 *     catch and handle this case if/when we do half-regs
		 */
	} else
	if (src->type == P_IMMD || src->type == P_CONST) {
		set_long(pc, e);
		set_data(pc, src, 0x7f, 9, e);
		e->inst[1] |= 0x20000000; /* src0 const? */
	} else {
		if (src->type == P_ATTR) {
			set_long(pc, e);
			e->inst[1] |= 0x00200000;	/* attribute read */
		}

		alloc_reg(pc, src);
		e->inst[0] |= (src->hw << 9);
	}

	/* finalize sub-opcode bits for the chosen encoding */
	if (is_long(e) && !is_immd(e)) {
		e->inst[1] |= 0x04000000; /* 32-bit */
		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
		if (!(e->inst[1] & 0x20000000))
			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
	} else
		e->inst[0] |= 0x00008000;

	emit(pc, e);
}
528
529static INLINE void
530emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
531{
532	struct nv50_reg *imm = alloc_immd(pc, f);
533	emit_mov(pc, dst, imm);
534	FREE(imm);
535}
536
537static boolean
538check_swap_src_0_1(struct nv50_pc *pc,
539		   struct nv50_reg **s0, struct nv50_reg **s1)
540{
541	struct nv50_reg *src0 = *s0, *src1 = *s1;
542
543	if (src0->type == P_CONST) {
544		if (src1->type != P_CONST) {
545			*s0 = src1;
546			*s1 = src0;
547			return TRUE;
548		}
549	} else
550	if (src1->type == P_ATTR) {
551		if (src0->type != P_ATTR) {
552			*s0 = src1;
553			*s1 = src0;
554			return TRUE;
555		}
556	}
557
558	return FALSE;
559}
560
561static void
562set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
563{
564	if (src->type == P_ATTR) {
565		set_long(pc, e);
566		e->inst[1] |= 0x00200000;
567	} else
568	if (src->type == P_CONST || src->type == P_IMMD) {
569		struct nv50_reg *temp = temp_temp(pc);
570
571		emit_mov(pc, temp, src);
572		src = temp;
573	}
574
575	alloc_reg(pc, src);
576	e->inst[0] |= (src->hw << 9);
577}
578
/* Place src in source slot 1 (inst[0] bits 16..).  Attributes cannot
 * go here and are copied through a scratch temp.  Const/immediate data
 * can be referenced directly (flag bit 23) unless slot 2 already uses
 * the data path (bit 24), in which case a temp copy is used instead.
 */
static void
set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		/* slot 1 must not already carry const/immd data */
		assert(!(e->inst[0] & 0x00800000));
		if (e->inst[0] & 0x01000000) {
			/* slot 2 already uses the data path: go via temp */
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 16, e);
			e->inst[0] |= 0x00800000;
		}
	}

	alloc_reg(pc, src);
	e->inst[0] |= (src->hw << 16);
}
604
/* Place src in source slot 2 (inst[1] bits 14..; long encoding only).
 * Attributes are copied through a scratch temp.  Const/immediate data
 * uses the data path (flag bit 24) unless slot 1 already does (bit
 * 23), in which case a temp copy is used instead.
 */
static void
set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	set_long(pc, e);

	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		/* slot 2 must not already carry const/immd data */
		assert(!(e->inst[0] & 0x01000000));
		if (e->inst[0] & 0x00800000) {
			/* slot 1 already uses the data path: go via temp */
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			/* 32+14: the index field lives in inst[1] */
			set_data(pc, src, 0x7f, 32+14, e);
			e->inst[0] |= 0x01000000;
		}
	}

	alloc_reg(pc, src);
	e->inst[1] |= (src->hw << 14);
}
632
/* dst = src0 * src1.  An immediate src1 can be inlined when the short
 * form is still possible; otherwise a negate on exactly one operand
 * sets the product-negate bit, whose position differs between the
 * short and long encodings.
 */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xc0000000;

	if (!pc->allow32)
		set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_IMMD && !is_long(e)) {
		if (src0->neg)
			e->inst[0] |= 0x00008000;
		/* src1's own negate is folded into the immediate */
		set_immd(pc, src1, e);
	} else {
		set_src_1(pc, src1, e);
		if (src0->neg ^ src1->neg) {
			if (is_long(e))
				e->inst[1] |= 0x08000000;
			else
				e->inst[0] |= 0x00008000;
		}
	}

	emit(pc, e);
}
663
/* dst = src0 + src1.  Negates force the long form (per-source negate
 * bits 26/27 of inst[1]).  src1 placement: const/attr must use slot 2,
 * immediates may be inlined in the short form, otherwise slot 1.
 */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	check_swap_src_0_1(pc, &src0, &src1);

	if (!pc->allow32 || src0->neg || src1->neg) {
		set_long(pc, e);
		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
	}

	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
		set_src_2(pc, src1, e);
	else
	if (src1->type == P_IMMD)
		set_immd(pc, src1, e);
	else
		set_src_1(pc, src1, e);

	emit(pc, e);
}
691
692static void
693emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
694	    struct nv50_reg *src0, struct nv50_reg *src1)
695{
696	struct nv50_program_exec *e = exec(pc);
697
698	set_long(pc, e);
699	e->inst[0] |= 0xb0000000;
700	e->inst[1] |= (sub << 29);
701
702	check_swap_src_0_1(pc, &src0, &src1);
703	set_dst(pc, dst, e);
704	set_src_0(pc, src0, e);
705	set_src_1(pc, src1, e);
706
707	emit(pc, e);
708}
709
710static INLINE void
711emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
712	 struct nv50_reg *src1)
713{
714	src1->neg ^= 1;
715	emit_add(pc, dst, src0, src1);
716	src1->neg ^= 1;
717}
718
719static void
720emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
721	 struct nv50_reg *src1, struct nv50_reg *src2)
722{
723	struct nv50_program_exec *e = exec(pc);
724
725	e->inst[0] |= 0xe0000000;
726
727	check_swap_src_0_1(pc, &src0, &src1);
728	set_dst(pc, dst, e);
729	set_src_0(pc, src0, e);
730	set_src_1(pc, src1, e);
731	set_src_2(pc, src2, e);
732
733	if (src0->neg ^ src1->neg)
734		e->inst[1] |= 0x04000000;
735	if (src2->neg)
736		e->inst[1] |= 0x08000000;
737
738	emit(pc, e);
739}
740
741static INLINE void
742emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
743	 struct nv50_reg *src1, struct nv50_reg *src2)
744{
745	src2->neg ^= 1;
746	emit_mad(pc, dst, src0, src1, src2);
747	src2->neg ^= 1;
748}
749
750static void
751emit_flop(struct nv50_pc *pc, unsigned sub,
752	  struct nv50_reg *dst, struct nv50_reg *src)
753{
754	struct nv50_program_exec *e = exec(pc);
755
756	e->inst[0] |= 0x90000000;
757	if (sub) {
758		set_long(pc, e);
759		e->inst[1] |= (sub << 29);
760	}
761
762	set_dst(pc, dst, e);
763	set_src_0(pc, src, e);
764
765	emit(pc, e);
766}
767
768static void
769emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
770{
771	struct nv50_program_exec *e = exec(pc);
772
773	e->inst[0] |= 0xb0000000;
774
775	set_dst(pc, dst, e);
776	set_src_0(pc, src, e);
777	set_long(pc, e);
778	e->inst[1] |= (6 << 29) | 0x00004000;
779
780	emit(pc, e);
781}
782
783static void
784emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
785{
786	struct nv50_program_exec *e = exec(pc);
787
788	e->inst[0] |= 0xb0000000;
789
790	set_dst(pc, dst, e);
791	set_src_0(pc, src, e);
792	set_long(pc, e);
793	e->inst[1] |= (6 << 29);
794
795	emit(pc, e);
796}
797
798#define CVTOP_RN	0x01
799#define CVTOP_FLOOR	0x03
800#define CVTOP_CEIL	0x05
801#define CVTOP_TRUNC	0x07
802#define CVTOP_SAT	0x08
803#define CVTOP_ABS	0x10
804
805/* 0x04 == 32 bit */
806/* 0x40 == dst is float */
807/* 0x80 == src is float */
808#define CVT_F32_F32 0xc4
809#define CVT_F32_S32 0x44
810#define CVT_F32_U32 0x64
811#define CVT_S32_F32 0x8c
812#define CVT_S32_S32 0x0c
813#define CVT_F32_F32_ROP 0xcc
814
/* Emit a cvt instruction: dst = cvn(src) with the given src/dst format
 * byte (CVT_*).  wp >= 0 additionally writes condition predicate wp.
 * dst == NULL sets the destination field to all-ones (0x1fc), i.e. no
 * register is written - presumably the hw null/bit-bucket register.
 */
static void
emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 int wp, unsigned cvn, unsigned fmt)
{
	struct nv50_program_exec *e;

	e = exec(pc);
	set_long(pc, e);

	e->inst[0] |= 0xa0000000;
	e->inst[1] |= 0x00004000;
	e->inst[1] |= (cvn << 16);	/* rounding/abs/sat operation */
	e->inst[1] |= (fmt << 24);	/* src/dst format */
	set_src_0(pc, src, e);

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);

	if (dst)
		set_dst(pc, dst, e);
	else {
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	emit(pc, e);
}
842
/* nv50 Condition codes:
 *  0x1 = LT
 *  0x2 = EQ
 *  0x3 = LE
 *  0x4 = GT
 *  0x5 = NE
 *  0x6 = GE
 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
 *  0x8 = unordered bit (allows NaN)
 */
/* Emit a comparison (src0 ccode src1) writing a register result and/or
 * predicate wp.  The integer set result is converted to float via an
 * abs-cvt at the end; a non-TEMP dst goes through an intermediate temp.
 */
static void
emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);
	struct nv50_reg *rdst;

	assert(ccode < 16);
	/* mirror the comparison if the operands were swapped */
	if (check_swap_src_0_1(pc, &src0, &src1))
		ccode = ccode ^ 0x7;

	rdst = dst;
	if (dst && dst->type != P_TEMP)
		dst = alloc_temp(pc, NULL);

	/* set.u32 */
	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= 0x60000000 | (ccode << 14);

	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
	 * that doesn't seem to match what the hw actually does
	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
	 */

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);
	if (dst)
		set_dst(pc, dst, e);
	else {
		/* no register result: write the null dst (0x1fc) */
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	emit(pc, e);

	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
	if (rdst)
		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
	if (rdst && rdst != dst)
		free_temp(pc, dst);
}
898
899static INLINE unsigned
900map_tgsi_setop_cc(unsigned op)
901{
902	switch (op) {
903	case TGSI_OPCODE_SLT: return 0x1;
904	case TGSI_OPCODE_SGE: return 0x6;
905	case TGSI_OPCODE_SEQ: return 0x2;
906	case TGSI_OPCODE_SGT: return 0x4;
907	case TGSI_OPCODE_SLE: return 0x3;
908	case TGSI_OPCODE_SNE: return 0xd;
909	default:
910		assert(0);
911		return 0;
912	}
913}
914
/* dst = floor(src) via cvt with round-toward-negative-infinity. */
static INLINE void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
}
920
921static void
922emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
923	 struct nv50_reg *v, struct nv50_reg *e)
924{
925	struct nv50_reg *temp = alloc_temp(pc, NULL);
926
927	emit_flop(pc, 3, temp, v);
928	emit_mul(pc, temp, temp, e);
929	emit_preex2(pc, temp, temp);
930	emit_flop(pc, 6, dst, temp);
931
932	free_temp(pc, temp);
933}
934
/* dst = |src| via cvt with the abs operation. */
static INLINE void
emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
}
940
/* dst = clamp(src, 0.0, 1.0) via cvt with the saturate operation. */
static INLINE void
emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
}
946
/* Emit TGSI LIT (per write mask): x = 1, y = max(src.x, 0), w = 1,
 * z = (src.x > 0) ? max(src.y, 0)^clamp(src.w, +-128) : 0.
 * Predicate 0 is written by the max(src.x, 0) instruction and used to
 * zero z when src.x <= 0; patching via pc->p->exec_tail makes the
 * instruction ordering here critical.
 */
static void
emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0);
	struct nv50_reg *zero = alloc_immd(pc, 0.0);
	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
	struct nv50_reg *tmp[4];
	boolean allow32 = pc->allow32;

	/* short encodings would break the exec_tail patching below */
	pc->allow32 = FALSE;

	if (mask & (3 << 1)) {
		tmp[0] = alloc_temp(pc, NULL);
		emit_minmax(pc, 4, tmp[0], src[0], zero);
	}

	if (mask & (1 << 2)) {
		/* make the max above also write predicate 0 */
		set_pred_wr(pc, 1, 0, pc->p->exec_tail);

		tmp[1] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[1], src[1], zero);

		/* clamp the exponent to +-128 (max, then min) */
		tmp[3] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[3], src[3], neg128);
		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);

		emit_pow(pc, dst[2], tmp[1], tmp[3]);
		emit_mov(pc, dst[2], zero);
		/* predicate the zero-mov on cc 0x3 (LE) of pred 0 */
		set_pred(pc, 3, 0, pc->p->exec_tail);
	}

	if (mask & (1 << 1))
		assimilate_temp(pc, dst[1], tmp[0]);
	else
	if (mask & (1 << 2))
		free_temp(pc, tmp[0]);

	pc->allow32 = allow32;

	/* do this last, in case src[i,j] == dst[0,3] */
	if (mask & (1 << 0))
		emit_mov(pc, dst[0], one);

	if (mask & (1 << 3))
		emit_mov(pc, dst[3], one);

	FREE(pos128);
	FREE(neg128);
	FREE(zero);
	FREE(one);
}
999}
1000
1001static void
1002emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1003{
1004	struct nv50_program_exec *e = exec(pc);
1005
1006	set_long(pc, e);
1007	e->inst[0] |= 0xa0000000; /* delta */
1008	e->inst[1] |= (7 << 29); /* delta */
1009	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
1010	e->inst[1] |= (1 << 14); /* src .f32 */
1011	set_dst(pc, dst, e);
1012	set_src_0(pc, src, e);
1013
1014	emit(pc, e);
1015}
1016
/* Emit fragment KIL on src: first an instruction that tests src and
 * writes predicate reg 1, then a discard predicated on that result.
 * The raw opcode words are taken as-is from hw traces.
 */
static void
emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
{
	struct nv50_program_exec *e;
	const int r_pred = 1;

	/* Sets predicate reg ? */
	e = exec(pc);
	e->inst[0] = 0xa00001fd;	/* dst field 0x1fc-style: no reg written */
	e->inst[1] = 0xc4014788;
	set_src_0(pc, src, e);
	set_pred_wr(pc, 1, r_pred, e);
	if (src->neg)
		e->inst[1] |= 0x20000000;	/* apply source negate */
	emit(pc, e);

	/* This is probably KILP */
	e = exec(pc);
	e->inst[0] = 0x000001fe;
	set_long(pc, e);
	/* discard only where the predicate test passed */
	set_pred(pc, 1 /* LT? */, r_pred, e);
	emit(pc, e);
}
1040
/* Emit a texture fetch: stage coordinates into an aligned temp quad
 * t[0..3], divide by q for projective lookups (re-interpolating with
 * perspective when the coords are raw FP inputs), then the tex insn
 * itself, and finally move the requested channels into dst.
 */
static void
emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
{
	struct nv50_reg *temp, *t[4];
	struct nv50_program_exec *e;

	unsigned c, mode, dim;

	/* number of coordinate components per target */
	switch (type) {
	case TGSI_TEXTURE_1D:
		dim = 1;
		break;
	case TGSI_TEXTURE_UNKNOWN:
	case TGSI_TEXTURE_2D:
	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
	case TGSI_TEXTURE_RECT:
		dim = 2;
		break;
	case TGSI_TEXTURE_3D:
	case TGSI_TEXTURE_CUBE:
	case TGSI_TEXTURE_SHADOW2D:
	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
		dim = 3;
		break;
	default:
		assert(0);
		break;
	}

	/* some cards need t[0]'s hw index to be a multiple of 4 */
	alloc_temp4(pc, t, 0);

	if (proj) {
		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
			/* coords are FP inputs: redo the interpolation
			 * with perspective, using 1/q from t[3] */
			mode = pc->interp_mode[src[0]->index];

			t[3]->rhw = src[3]->rhw;
			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
			emit_flop(pc, 0, t[3], t[3]);	/* t[3] = 1/q */

			for (c = 0; c < dim; c++) {
				t[c]->rhw = src[c]->rhw;
				emit_interp(pc, t[c], t[3],
					    (mode | INTERP_PERSPECTIVE));
			}
		} else {
			/* already-computed coords: plain divide by q */
			emit_flop(pc, 0, t[3], src[3]);
			for (c = 0; c < dim; c++)
				emit_mul(pc, t[c], src[c], t[3]);

			/* XXX: for some reason the blob sometimes uses MAD:
			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
			 */
		}
	} else {
		if (type == TGSI_TEXTURE_CUBE) {
			/* scale coords by 1/max(|x|,|y|,|z|)-style factor
			 * (minmax sub 4 over the three components) */
			temp = temp_temp(pc);
			emit_minmax(pc, 4, temp, src[0], src[1]);
			emit_minmax(pc, 4, temp, temp, src[2]);
			emit_flop(pc, 0, temp, temp);
			for (c = 0; c < 3; c++)
				emit_mul(pc, t[c], src[c], temp);
		} else {
			for (c = 0; c < dim; c++)
				emit_mov(pc, t[c], src[c]);
		}
	}

	e = exec(pc);
	set_long(pc, e);
	e->inst[0] |= 0xf0000000;	/* tex opcode */
	e->inst[1] |= 0x00000004;
	set_dst(pc, t[0], e);
	e->inst[0] |= (unit << 9);	/* sampler unit */

	if (dim == 2)
		e->inst[0] |= 0x00400000;
	else
	if (dim == 3)
		e->inst[0] |= 0x00800000;

	/* channel write mask, split across the two words */
	e->inst[0] |= (mask & 0x3) << 25;
	e->inst[1] |= (mask & 0xc) << 12;

	emit(pc, e);

#if 1
	if (mask & 1) emit_mov(pc, dst[0], t[0]);
	if (mask & 2) emit_mov(pc, dst[1], t[1]);
	if (mask & 4) emit_mov(pc, dst[2], t[2]);
	if (mask & 8) emit_mov(pc, dst[3], t[3]);

	free_temp4(pc, t);
#else
	/* XXX: if p.e. MUL is used directly after TEX, it would still use
	 * the texture coordinates, not the fetched values: latency ? */

	for (c = 0; c < 4; c++) {
		if (mask & (1 << c))
			assimilate_temp(pc, dst[c], t[c]);
		else
			free_temp(pc, t[c]);
	}
#endif
}
1148
/* Convert a short (32-bit) instruction to the long encoding in place:
 * per opcode, mask m clears the inst[0] bits that move and q holds
 * them at their long-form position for inst[1].
 */
static void
convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	unsigned q = 0, m = ~0;

	assert(!is_long(e));

	/* dispatch on the 4-bit opcode field */
	switch (e->inst[0] >> 28) {
	case 0x1:
		/* MOV */
		q = 0x0403c000;
		m = 0xffff7fff;
		break;
	case 0x8:
		/* INTERP (move centroid, perspective and flat bits) */
		m = ~0x03000100;
		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
		break;
	case 0x9:
		/* RCP */
		break;
	case 0xB:
		/* ADD */
		m = ~(127 << 16);
		q = ((e->inst[0] & (~m)) >> 2);
		break;
	case 0xC:
		/* MUL */
		m = ~0x00008000;
		q = ((e->inst[0] & (~m)) << 12);
		break;
	case 0xE:
		/* MAD (if src2 == dst) */
		q = ((e->inst[0] & 0x1fc) << 12);
		break;
	default:
		assert(0);
		break;
	}

	set_long(pc, e);
	pc->p->exec_size++;	/* long form occupies one extra word */

	e->inst[0] &= m;
	e->inst[1] |= q;
}
1196
1197static boolean
1198negate_supported(const struct tgsi_full_instruction *insn, int i)
1199{
1200	switch (insn->Instruction.Opcode) {
1201	case TGSI_OPCODE_DP3:
1202	case TGSI_OPCODE_DP4:
1203	case TGSI_OPCODE_MUL:
1204	case TGSI_OPCODE_KIL:
1205	case TGSI_OPCODE_ADD:
1206	case TGSI_OPCODE_SUB:
1207	case TGSI_OPCODE_MAD:
1208		return TRUE;
1209	case TGSI_OPCODE_POW:
1210		return (i == 1) ? TRUE : FALSE;
1211	default:
1212		return FALSE;
1213	}
1214}
1215
/* Return a read mask for source registers deduced from opcode & write mask. */
static unsigned
nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
{
	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;

	switch (insn->Instruction.Opcode) {
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_SIN:
		/* x feeds all of xyz; w is independent */
		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
	case TGSI_OPCODE_DP3:
		return 0x7;
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_DPH:
	case TGSI_OPCODE_KIL: /* WriteMask ignored */
		return 0xf;
	case TGSI_OPCODE_DST:
		/* src0 contributes y/z, src1 contributes y/w */
		return mask & (c ? 0xa : 0x6);
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SCS:
		/* scalar ops read only the x channel */
		return 0x1;
	case TGSI_OPCODE_LIT:
		return 0xb;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXP:
	{
		const struct tgsi_instruction_ext_texture *tex;

		assert(insn->Instruction.Extended);
		tex = &insn->InstructionExtTexture;

		mask = 0x7;
		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
			mask |= 0x8;	/* projective divide reads q */

		/* drop coord channels the target doesn't use */
		switch (tex->Texture) {
		case TGSI_TEXTURE_1D:
			mask &= 0x9;
			break;
		case TGSI_TEXTURE_2D:
			mask &= 0xb;
			break;
		default:
			break;
		}
	}
		return mask;
	case TGSI_OPCODE_XPD:
		/* each result channel reads the two "other" channels */
		x = 0;
		if (mask & 1) x |= 0x6;
		if (mask & 2) x |= 0x5;
		if (mask & 4) x |= 0x3;
		return x;
	default:
		break;
	}

	return mask;
}
1279
1280static struct nv50_reg *
1281tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1282{
1283	switch (dst->DstRegister.File) {
1284	case TGSI_FILE_TEMPORARY:
1285		return &pc->temp[dst->DstRegister.Index * 4 + c];
1286	case TGSI_FILE_OUTPUT:
1287		return &pc->result[dst->DstRegister.Index * 4 + c];
1288	case TGSI_FILE_NULL:
1289		return NULL;
1290	default:
1291		break;
1292	}
1293
1294	return NULL;
1295}
1296
/* Resolve a TGSI source operand (file, extended swizzle, sign mode) for
 * channel 'chan' to an nv50_reg.
 *
 * ZERO/ONE extended swizzles become fresh immediates (freed again by
 * nv50_program_tx_insn after emission).  ABS and unsupported negation
 * are resolved by copying through a temporary; if 'neg' is TRUE the
 * instruction can negate operands itself, so we only flag the register.
 */
static struct nv50_reg *
tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
	 boolean neg)
{
	struct nv50_reg *r = NULL;
	struct nv50_reg *temp;
	unsigned sgn, c;

	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);

	/* map the requested channel through the (extended) swizzle */
	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
	switch (c) {
	case TGSI_EXTSWIZZLE_X:
	case TGSI_EXTSWIZZLE_Y:
	case TGSI_EXTSWIZZLE_Z:
	case TGSI_EXTSWIZZLE_W:
		switch (src->SrcRegister.File) {
		case TGSI_FILE_INPUT:
			r = &pc->attr[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_TEMPORARY:
			r = &pc->temp[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_CONSTANT:
			r = &pc->param[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_IMMEDIATE:
			r = &pc->immd[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_SAMPLER:
			/* sampler index is handled via 'unit' in the caller */
			break;
		default:
			assert(0);
			break;
		}
		break;
	case TGSI_EXTSWIZZLE_ZERO:
		r = alloc_immd(pc, 0.0);
		return r;
	case TGSI_EXTSWIZZLE_ONE:
		/* fold the sign mode into the constant itself */
		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
			return alloc_immd(pc, -1.0);
		return alloc_immd(pc, 1.0);
	default:
		assert(0);
		break;
	}

	/* apply sign mode; ABS and forced negation go through a temp */
	switch (sgn) {
	case TGSI_UTIL_SIGN_KEEP:
		break;
	case TGSI_UTIL_SIGN_CLEAR:
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		r = temp;
		break;
	case TGSI_UTIL_SIGN_TOGGLE:
		if (neg)
			/* NOTE(review): flags the shared reg for negation;
			 * assumed to be consumed at emission -- verify */
			r->neg = 1;
		else {
			temp = temp_temp(pc);
			emit_neg(pc, temp, r);
			r = temp;
		}
		break;
	case TGSI_UTIL_SIGN_SET:
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		if (neg)
			temp->neg = 1;
		else
			emit_neg(pc, temp, temp);
		r = temp;
		break;
	default:
		assert(0);
		break;
	}

	return r;
}
1378
1379/* return TRUE for ops that produce only a single result */
1380static boolean
1381is_scalar_op(unsigned op)
1382{
1383	switch (op) {
1384	case TGSI_OPCODE_COS:
1385	case TGSI_OPCODE_DP2:
1386	case TGSI_OPCODE_DP3:
1387	case TGSI_OPCODE_DP4:
1388	case TGSI_OPCODE_DPH:
1389	case TGSI_OPCODE_EX2:
1390	case TGSI_OPCODE_LG2:
1391	case TGSI_OPCODE_POW:
1392	case TGSI_OPCODE_RCP:
1393	case TGSI_OPCODE_RSQ:
1394	case TGSI_OPCODE_SIN:
1395		/*
1396	case TGSI_OPCODE_KIL:
1397	case TGSI_OPCODE_LIT:
1398	case TGSI_OPCODE_SCS:
1399		*/
1400		return TRUE;
1401	default:
1402		return FALSE;
1403	}
1404}
1405
1406/* Returns a bitmask indicating which dst components depend
1407 * on source s, component c (reverse of nv50_tgsi_src_mask).
1408 */
1409static unsigned
1410nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1411{
1412	if (is_scalar_op(op))
1413		return 0x1;
1414
1415	switch (op) {
1416	case TGSI_OPCODE_DST:
1417		return (1 << c) & (s ? 0xa : 0x6);
1418	case TGSI_OPCODE_XPD:
1419		switch (c) {
1420		case 0: return 0x6;
1421		case 1: return 0x5;
1422		case 2: return 0x3;
1423		case 3: return 0x0;
1424		default:
1425			assert(0);
1426			return 0x0;
1427		}
1428	case TGSI_OPCODE_LIT:
1429	case TGSI_OPCODE_SCS:
1430	case TGSI_OPCODE_TEX:
1431	case TGSI_OPCODE_TXP:
1432		/* these take care of dangerous swizzles themselves */
1433		return 0x0;
1434	case TGSI_OPCODE_IF:
1435	case TGSI_OPCODE_KIL:
1436		/* don't call this function for these ops */
1437		assert(0);
1438		return 0;
1439	default:
1440		/* linear vector instruction */
1441		return (1 << c);
1442	}
1443}
1444
/* Emit hardware code for a single TGSI instruction.
 *
 * Collects dst/src nv50_reg handles (honouring the write mask and the
 * per-source read masks), routes scalar results through the broadcast
 * register pc->r_brdc, applies saturation either on the broadcast reg
 * or via per-component temporaries, and finally frees the temporaries
 * and inline immediates.  Returns FALSE on an unhandled opcode.
 */
static boolean
nv50_program_tx_insn(struct nv50_pc *pc,
		     const struct tgsi_full_instruction *inst)
{
	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
	unsigned mask, sat, unit;
	int i, c;

	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;

	memset(src, 0, sizeof(src));

	/* Destination setup: pc->r_dst[c], when set by nv50_tgsi_insn,
	 * overrides the TGSI dst (used to reroute hazardous components
	 * through temps).  rdst[] keeps the real dst for late saturation.
	 */
	for (c = 0; c < 4; c++) {
		if ((mask & (1 << c)) && !pc->r_dst[c])
			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
		else
			dst[c] = pc->r_dst[c];
		rdst[c] = dst[c];
	}

	/* Source setup: fetch only the components the opcode reads. */
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
		unsigned src_mask;
		boolean neg_supp;

		src_mask = nv50_tgsi_src_mask(inst, i);
		neg_supp = negate_supported(inst, i);

		/* remember texture unit; only set for TEX/TXP operands */
		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
			unit = fs->SrcRegister.Index;

		for (c = 0; c < 4; c++)
			if (src_mask & (1 << c))
				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
	}

	/* Scalar ops compute into brdc/temp and broadcast afterwards; if
	 * we must saturate, compute into a temp first.
	 */
	brdc = temp = pc->r_brdc;
	if (brdc && brdc->type != P_TEMP) {
		temp = temp_temp(pc);
		if (sat)
			brdc = temp;
	} else
	if (sat) {
		/* vector op with saturation: non-TEMP dst components get
		 * written to temps now and saturated into place at the end */
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
				continue;
			rdst[c] = dst[c];
			dst[c] = temp_temp(pc);
		}
	}

	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ABS:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_abs(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_ADD:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_add(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_CEIL:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_cvt(pc, dst[c], src[0][c], -1,
				 CVTOP_CEIL, CVT_F32_F32);
		}
		break;
	case TGSI_OPCODE_COS:
		/* w uses its own source channel; x/y/z broadcast cos(x) */
		if (mask & 8) {
			emit_precossin(pc, temp, src[0][3]);
			emit_flop(pc, 5, dst[3], temp);
			if (!(mask &= 7))
				break;
			if (temp == dst[3])
				temp = brdc = temp_temp(pc);
		}
		emit_precossin(pc, temp, src[0][0]);
		emit_flop(pc, 5, brdc, temp);
		break;
	case TGSI_OPCODE_DP3:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
		break;
	case TGSI_OPCODE_DP4:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, temp, src[0][2], src[1][2], temp);
		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
		break;
	case TGSI_OPCODE_DPH:
		/* dot3 plus src1.w */
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, temp, src[0][2], src[1][2], temp);
		emit_add(pc, brdc, src[1][3], temp);
		break;
	case TGSI_OPCODE_DST:
		if (mask & (1 << 1))
			emit_mul(pc, dst[1], src[0][1], src[1][1]);
		if (mask & (1 << 2))
			emit_mov(pc, dst[2], src[0][2]);
		if (mask & (1 << 3))
			emit_mov(pc, dst[3], src[1][3]);
		if (mask & (1 << 0))
			emit_mov_immdval(pc, dst[0], 1.0f);
		break;
	case TGSI_OPCODE_EX2:
		emit_preex2(pc, temp, src[0][0]);
		emit_flop(pc, 6, brdc, temp);
		break;
	case TGSI_OPCODE_FLR:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_flr(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_FRC:
		/* x - floor(x) */
		temp = temp_temp(pc);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_flr(pc, temp, src[0][c]);
			emit_sub(pc, dst[c], src[0][c], temp);
		}
		break;
	case TGSI_OPCODE_KIL:
		emit_kil(pc, src[0][0]);
		emit_kil(pc, src[0][1]);
		emit_kil(pc, src[0][2]);
		emit_kil(pc, src[0][3]);
		break;
	case TGSI_OPCODE_LIT:
		emit_lit(pc, &dst[0], mask, &src[0][0]);
		break;
	case TGSI_OPCODE_LG2:
		emit_flop(pc, 3, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_LRP:
		/* src0 * (src1 - src2) + src2 */
		temp = temp_temp(pc);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_sub(pc, temp, src[1][c], src[2][c]);
			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
		}
		break;
	case TGSI_OPCODE_MAD:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
		}
		break;
	case TGSI_OPCODE_MAX:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_MIN:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_MOV:
	case TGSI_OPCODE_SWZ:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mov(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_MUL:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mul(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_POW:
		emit_pow(pc, brdc, src[0][0], src[1][0]);
		break;
	case TGSI_OPCODE_RCP:
		emit_flop(pc, 0, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_RSQ:
		emit_flop(pc, 2, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_SCS:
		temp = temp_temp(pc);
		if (mask & 3)
			emit_precossin(pc, temp, src[0][0]);
		if (mask & (1 << 0))
			emit_flop(pc, 5, dst[0], temp);
		if (mask & (1 << 1))
			emit_flop(pc, 4, dst[1], temp);
		if (mask & (1 << 2))
			emit_mov_immdval(pc, dst[2], 0.0);
		if (mask & (1 << 3))
			emit_mov_immdval(pc, dst[3], 1.0);
		break;
	case TGSI_OPCODE_SIN:
		/* same w handling as COS above */
		if (mask & 8) {
			emit_precossin(pc, temp, src[0][3]);
			emit_flop(pc, 4, dst[3], temp);
			if (!(mask &= 7))
				break;
			if (temp == dst[3])
				temp = brdc = temp_temp(pc);
		}
		emit_precossin(pc, temp, src[0][0]);
		emit_flop(pc, 4, brdc, temp);
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_SUB:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_sub(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_TEX:
		emit_tex(pc, dst, mask, src[0], unit,
			 inst->InstructionExtTexture.Texture, FALSE);
		break;
	case TGSI_OPCODE_TXP:
		emit_tex(pc, dst, mask, src[0], unit,
			 inst->InstructionExtTexture.Texture, TRUE);
		break;
	case TGSI_OPCODE_TRUNC:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_cvt(pc, dst[c], src[0][c], -1,
				 CVTOP_TRUNC, CVT_F32_F32);
		}
		break;
	case TGSI_OPCODE_XPD:
		/* cross product via MUL + MSB (mul-sub) per component */
		temp = temp_temp(pc);
		if (mask & (1 << 0)) {
			emit_mul(pc, temp, src[0][2], src[1][1]);
			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
		}
		if (mask & (1 << 1)) {
			emit_mul(pc, temp, src[0][0], src[1][2]);
			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
		}
		if (mask & (1 << 2)) {
			emit_mul(pc, temp, src[0][1], src[1][0]);
			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
		}
		if (mask & (1 << 3))
			emit_mov_immdval(pc, dst[3], 1.0);
		break;
	case TGSI_OPCODE_END:
		break;
	default:
		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
		return FALSE;
	}

	/* broadcast scalar result / apply deferred saturation */
	if (brdc) {
		if (sat)
			emit_sat(pc, brdc, brdc);
		for (c = 0; c < 4; c++)
			if ((mask & (1 << c)) && dst[c] != brdc)
				emit_mov(pc, dst[c], brdc);
	} else
	if (sat) {
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			/* in this case we saturate later */
			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
				continue;
			emit_sat(pc, rdst[c], dst[c]);
		}
	}

	/* free inline immediates created by tgsi_src */
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		for (c = 0; c < 4; c++) {
			if (!src[i][c])
				continue;
			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
				FREE(src[i][c]);
		}
	}

	kill_temp_temp(pc);
	return TRUE;
}
1762
1763static void
1764prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
1765{
1766	struct nv50_reg *reg = NULL;
1767	const struct tgsi_full_src_register *src;
1768	const struct tgsi_dst_register *dst;
1769	unsigned i, c, k, mask;
1770
1771	dst = &insn->FullDstRegisters[0].DstRegister;
1772	mask = dst->WriteMask;
1773
1774        if (dst->File == TGSI_FILE_TEMPORARY)
1775                reg = pc->temp;
1776        else
1777        if (dst->File == TGSI_FILE_OUTPUT)
1778                reg = pc->result;
1779
1780	if (reg) {
1781		for (c = 0; c < 4; c++) {
1782			if (!(mask & (1 << c)))
1783				continue;
1784			reg[dst->Index * 4 + c].acc = pc->insn_nr;
1785		}
1786	}
1787
1788	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1789		src = &insn->FullSrcRegisters[i];
1790
1791		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
1792			reg = pc->temp;
1793		else
1794		if (src->SrcRegister.File == TGSI_FILE_INPUT)
1795			reg = pc->attr;
1796		else
1797			continue;
1798
1799		mask = nv50_tgsi_src_mask(insn, i);
1800
1801		for (c = 0; c < 4; c++) {
1802			if (!(mask & (1 << c)))
1803				continue;
1804			k = tgsi_util_get_full_src_register_extswizzle(src, c);
1805
1806			if (k > TGSI_EXTSWIZZLE_W)
1807				continue;
1808
1809			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
1810		}
1811	}
1812}
1813
/* Returns a bitmask indicating which dst components need to be
 * written to temporaries first to avoid 'corrupting' sources.
 *
 * m[i]   (out) indicate component to write in the i-th position
 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* BUGFIX: 'unsafe' was never initialized before being |='d below,
	 * which is undefined behaviour and could report bogus hazards. */
	unsigned i, c, x, unsafe = 0;

	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
1858
1859/* Select a suitable dst register for broadcasting scalar results,
1860 * or return NULL if we have to allocate an extra TEMP.
1861 *
1862 * If e.g. only 1 component is written, we may also emit the final
1863 * result to a write-only register.
1864 */
1865static struct nv50_reg *
1866tgsi_broadcast_dst(struct nv50_pc *pc,
1867		   const struct tgsi_full_dst_register *fd, unsigned mask)
1868{
1869	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
1870		int c = ffs(~mask & fd->DstRegister.WriteMask);
1871		if (c)
1872			return tgsi_dst(pc, c - 1, fd);
1873	} else {
1874		int c = ffs(fd->DstRegister.WriteMask) - 1;
1875		if ((1 << c) == fd->DstRegister.WriteMask)
1876			return tgsi_dst(pc, c, fd);
1877	}
1878
1879	return NULL;
1880}
1881
/* Scan source swizzles and return a bitmask indicating dst regs that
 * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
 */
static unsigned
nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
		       unsigned rdep[4])
{
	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
	const struct tgsi_full_src_register *fs;
	unsigned i, deqs = 0;

	for (i = 0; i < 4; ++i)
		rdep[i] = 0;

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
		boolean neg_supp = negate_supported(insn, i);

		fs = &insn->FullSrcRegisters[i];
		/* only a src aliasing the dst register can be clobbered */
		if (fs->SrcRegister.File != fd->DstRegister.File ||
		    fs->SrcRegister.Index != fd->DstRegister.Index)
			continue;

		for (chn = 0; chn < 4; ++chn) {
			unsigned s, c;

			if (!(mask & (1 << chn))) /* src is not read */
				continue;
			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);

			/* ZERO/ONE swizzles and unwritten components are safe */
			if (c > TGSI_EXTSWIZZLE_W ||
			    !(fd->DstRegister.WriteMask & (1 << c)))
				continue;

			/* no danger if src is copied to TEMP first */
			if ((s != TGSI_UTIL_SIGN_KEEP) &&
			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
				continue;

			rdep[c] |= nv50_tgsi_dst_revdep(
				insn->Instruction.Opcode, i, chn);
			deqs |= (1 << c);
		}
	}

	return deqs;
}
1930
/* Translate one TGSI instruction token, handling swizzle hazards.
 *
 * Scalar ops get a broadcast dst (pc->r_brdc).  If some dst components
 * are also read as sources, the instruction is emitted one component
 * at a time in the order computed by nv50_revdep_reorder; components
 * whose hazards could not be reordered away are written to fresh temps
 * first and moved (or saturated) into place at the end.
 */
static boolean
nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
{
	struct tgsi_full_instruction insn = tok->FullInstruction;
	const struct tgsi_full_dst_register *fd;
	unsigned i, deqs, rdep[4], m[4];

	fd = &tok->FullInstruction.FullDstRegisters[0];
	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);

	if (is_scalar_op(insn.Instruction.Opcode)) {
		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
		if (!pc->r_brdc)
			pc->r_brdc = temp_temp(pc);
		return nv50_program_tx_insn(pc, &insn);
	}
	pc->r_brdc = NULL;

	/* no dst component occurs among the sources: emit as-is */
	if (!deqs)
		return nv50_program_tx_insn(pc, &insn);

	deqs = nv50_revdep_reorder(m, rdep);

	/* emit per component in the reordered sequence m[] */
	for (i = 0; i < 4; ++i) {
		assert(pc->r_dst[m[i]] == NULL);

		insn.FullDstRegisters[0].DstRegister.WriteMask =
			fd->DstRegister.WriteMask & (1 << m[i]);

		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
			continue;

		/* unresolvable hazard: redirect this component to a temp */
		if (deqs & (1 << i))
			pc->r_dst[m[i]] = alloc_temp(pc, NULL);

		if (!nv50_program_tx_insn(pc, &insn))
			return FALSE;
	}

	/* move (and saturate) redirected components into the real dst */
	for (i = 0; i < 4; i++) {
		struct nv50_reg *reg = pc->r_dst[i];
		if (!reg)
			continue;
		pc->r_dst[i] = NULL;

		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
		else
			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
		free_temp(pc, reg);
	}

	return TRUE;
}
1985
1986static void
1987load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
1988{
1989	struct nv50_reg *iv, **ppiv;
1990	unsigned mode = pc->interp_mode[reg->index];
1991
1992	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
1993	iv = *ppiv;
1994
1995	if ((mode & INTERP_PERSPECTIVE) && !iv) {
1996		iv = *ppiv = alloc_temp(pc, NULL);
1997		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
1998
1999		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2000		emit_flop(pc, 0, iv, iv);
2001
2002		/* XXX: when loading interpolants dynamically, move these
2003		 * to the program head, or make sure it can't be skipped.
2004		 */
2005	}
2006
2007	emit_interp(pc, reg, iv, mode);
2008}
2009
/* First pass over the TGSI tokens: collect immediates, record
 * declarations (FP interpolation modes, two-sided-colour outputs),
 * count instructions and register accesses (prep_inspect_insn), then
 * assign hardware slots for VP inputs/outputs or FP interpolants.
 */
static boolean
nv50_program_tx_prep(struct nv50_pc *pc)
{
	struct tgsi_parse_context tp;
	struct nv50_program *p = pc->p;
	boolean ret = FALSE;
	unsigned i, c, flat_nr = 0;

	tgsi_parse_init(&tp, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&tp)) {
		const union tgsi_full_token *tok = &tp.FullToken;

		tgsi_parse_token(&tp);
		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm =
				&tp.FullToken.FullImmediate;

			ctor_immd(pc, imm->u[0].Float,
				      imm->u[1].Float,
				      imm->u[2].Float,
				      imm->u[3].Float);
		}
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *d;
			unsigned si, last, first, mode;

			d = &tp.FullToken.FullDeclaration;
			first = d->DeclarationRange.First;
			last = d->DeclarationRange.Last;

			switch (d->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				break;
			case TGSI_FILE_OUTPUT:
				if (!d->Declaration.Semantic ||
				    p->type == PIPE_SHADER_FRAGMENT)
					break;

				si = d->Semantic.SemanticIndex;
				switch (d->Semantic.SemanticName) {
				case TGSI_SEMANTIC_BCOLOR:
					/* remember back-colour output slot
					 * for light-twoside */
					p->cfg.two_side[si].hw = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
					/*
				case TGSI_SEMANTIC_CLIP_DISTANCE:
					p->cfg.clpd = MIN2(p->cfg.clpd, first);
					break;
					*/
				default:
					break;
				}
				break;
			case TGSI_FILE_INPUT:
			{
				if (p->type != PIPE_SHADER_FRAGMENT)
					break;

				switch (d->Declaration.Interpolate) {
				case TGSI_INTERPOLATE_CONSTANT:
					mode = INTERP_FLAT;
					flat_nr++;
					break;
				case TGSI_INTERPOLATE_PERSPECTIVE:
					mode = INTERP_PERSPECTIVE;
					p->cfg.regs[1] |= 0x08 << 24;
					break;
				default:
					mode = INTERP_LINEAR;
					break;
				}
				if (d->Declaration.Centroid)
					mode |= INTERP_CENTROID;

				assert(last < 32);
				for (i = first; i <= last; i++)
					pc->interp_mode[i] = mode;
			}
				break;
			case TGSI_FILE_CONSTANT:
				break;
			case TGSI_FILE_SAMPLER:
				break;
			default:
				NOUVEAU_ERR("bad decl file %d\n",
					    d->Declaration.File);
				goto out_err;
			}
		}
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			pc->insn_nr++;
			prep_inspect_insn(pc, &tok->FullInstruction);
			break;
		default:
			break;
		}
	}

	if (p->type == PIPE_SHADER_VERTEX) {
		int rid = 0;

		/* pack accessed attr components into consecutive hw slots */
		for (i = 0; i < pc->attr_nr * 4; ++i) {
			if (pc->attr[i].acc) {
				pc->attr[i].hw = rid++;
				p->cfg.attr[i / 32] |= 1 << (i % 32);
			}
		}

		/* likewise for results, recording per-output masks */
		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
			p->cfg.io[i].hw = rid;
			p->cfg.io[i].id_vp = i;

			for (c = 0; c < 4; ++c) {
				int n = i * 4 + c;
				if (!pc->result[n].acc)
					continue;
				pc->result[n].hw = rid++;
				p->cfg.io[i].mask |= 1 << c;
			}
		}

		/* resolve BCOLOR slots recorded above (0x40 = unused) */
		for (c = 0; c < 2; ++c)
			if (p->cfg.two_side[c].hw < 0x40)
				p->cfg.two_side[c] = p->cfg.io[
					p->cfg.two_side[c].hw];
	} else
	if (p->type == PIPE_SHADER_FRAGMENT) {
		int rid, aid;
		unsigned n = 0, m = pc->attr_nr - flat_nr;

		int base = (TGSI_SEMANTIC_POSITION ==
			    p->info.input_semantic_name[0]) ? 0 : 1;

		/* non-flat interpolants have to be mapped to
		 * the lower hardware IDs, so sort them:
		 */
		for (i = 0; i < pc->attr_nr; i++) {
			if (pc->interp_mode[i] == INTERP_FLAT) {
				p->cfg.io[m].id_vp = i + base;
				p->cfg.io[m++].id_fp = i;
			} else {
				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
					p->cfg.io[n].linear = TRUE;
				p->cfg.io[n].id_vp = i + base;
				p->cfg.io[n++].id_fp = i;
			}
		}

		if (!base) /* set w-coordinate mask from perspective interp */
			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;

		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);

		/* assign hw interpolant ids and emit the input loads */
		for (n = 0; n < pc->attr_nr; ++n) {
			p->cfg.io[n].hw = rid = aid;
			i = p->cfg.io[n].id_fp;

			for (c = 0; c < 4; ++c) {
				if (!pc->attr[i * 4 + c].acc)
					continue;
				pc->attr[i * 4 + c].rhw = rid++;
				p->cfg.io[n].mask |= 1 << c;

				load_interpolant(pc, &pc->attr[i * 4 + c]);
			}
			aid += popcnt4(p->cfg.io[n].mask);
		}

		if (!base)
			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;

		m = popcnt4(p->cfg.regs[1] >> 24);

		/* set count of non-position inputs and of non-flat
		 * non-position inputs for FP_INTERPOLANT_CTRL
		 */
		p->cfg.regs[1] |= aid - m;

		if (flat_nr) {
			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
			p->cfg.regs[1] |= (i - m) << 16;
		} else
			/* NOTE(review): this shifts the whole register, not
			 * just the count field -- verify it is intended */
			p->cfg.regs[1] |= p->cfg.regs[1] << 16;

		/* mark color semantic for light-twoside */
		n = 0x40;
		for (i = 0; i < pc->attr_nr; i++) {
			ubyte si, sn;

			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];

			if (sn == TGSI_SEMANTIC_COLOR) {
				p->cfg.two_side[si] = p->cfg.io[i];

				/* increase colour count */
				p->cfg.regs[0] += popcnt4(
					p->cfg.two_side[si].mask) << 16;

				n = MIN2(n, p->cfg.io[i].hw - m);
			}
		}
		if (n < 0x40)
			p->cfg.regs[0] += n;

		/* Initialize FP results:
		 * FragDepth is always first TGSI and last hw output
		 */
		i = p->info.writes_z ? 4 : 0;
		for (rid = 0; i < pc->result_nr * 4; i++)
			pc->result[i].rhw = rid++;
		if (p->info.writes_z)
			pc->result[2].rhw = rid;
	}

	/* one nv50_reg per immediate component collected above */
	if (pc->immd_nr) {
		int rid = 0;

		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->immd)
			goto out_err;

		for (i = 0; i < pc->immd_nr; i++) {
			for (c = 0; c < 4; c++, rid++)
				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
		}
	}

	ret = TRUE;
out_err:
	/* release the 1/w interpolation temps allocated on demand */
	if (pc->iv_p)
		free_temp(pc, pc->iv_p);
	if (pc->iv_c)
		free_temp(pc, pc->iv_c);

	tgsi_parse_free(&tp);
	return ret;
}
2255
2256static void
2257free_nv50_pc(struct nv50_pc *pc)
2258{
2259	if (pc->immd)
2260		FREE(pc->immd);
2261	if (pc->param)
2262		FREE(pc->param);
2263	if (pc->result)
2264		FREE(pc->result);
2265	if (pc->attr)
2266		FREE(pc->attr);
2267	if (pc->temp)
2268		FREE(pc->temp);
2269
2270	FREE(pc);
2271}
2272
/* Initialize per-translation state: register counts from the TGSI scan
 * info, shader-type-specific hardware config defaults, and the backing
 * nv50_reg arrays for temps, attrs, results and params.
 */
static boolean
ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
{
	int i, c;
	/* fragment shaders read inputs / write results via TEMPs instead */
	unsigned rtype[2] = { P_ATTR, P_RESULT };

	pc->p = p;
	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;

	p->cfg.high_temp = 4;

	/* 0x40 = sentinel for "no two-side colour output recorded" */
	p->cfg.two_side[0].hw = 0x40;
	p->cfg.two_side[1].hw = 0x40;

	switch (p->type) {
	case PIPE_SHADER_VERTEX:
		p->cfg.clpd = 0x40;
		p->cfg.io_nr = pc->result_nr;
		break;
	case PIPE_SHADER_FRAGMENT:
		rtype[0] = rtype[1] = P_TEMP;

		/* NOTE(review): magic hw config words for the FP header;
		 * consult the nv50 3D class docs before changing */
		p->cfg.regs[0] = 0x01000004;
		p->cfg.io_nr = pc->attr_nr;

		if (p->info.writes_z) {
			p->cfg.regs[2] |= 0x00000100;
			p->cfg.regs[3] |= 0x00000011;
		}
		if (p->info.uses_kill)
			p->cfg.regs[2] |= 0x00100000;
		break;
	}

	if (pc->temp_nr) {
		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->temp)
			return FALSE;

		for (i = 0; i < pc->temp_nr * 4; ++i)
			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
	}

	if (pc->attr_nr) {
		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->attr)
			return FALSE;

		for (i = 0; i < pc->attr_nr * 4; ++i)
			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
	}

	if (pc->result_nr) {
		unsigned nr = pc->result_nr * 4;

		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
		if (!pc->result)
			return FALSE;

		for (i = 0; i < nr; ++i)
			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
	}

	if (pc->param_nr) {
		int rid = 0;

		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->param)
			return FALSE;

		for (i = 0; i < pc->param_nr; ++i)
			for (c = 0; c < 4; ++c, ++rid)
				ctor_reg(&pc->param[rid], P_CONST, i, rid);
	}

	return TRUE;
}
2353
/* Translate a whole program: run the prep pass, emit every TGSI
 * instruction, move FP results into their final hw registers, pair up
 * stray half-size (short) instructions, and mark the program end.
 */
static boolean
nv50_program_tx(struct nv50_program *p)
{
	struct tgsi_parse_context parse;
	struct nv50_pc *pc;
	unsigned k;
	boolean ret;

	pc = CALLOC_STRUCT(nv50_pc);
	if (!pc)
		return FALSE;

	ret = ctor_nv50_pc(pc, p);
	if (ret == FALSE)
		goto out_cleanup;

	ret = nv50_program_tx_prep(pc);
	if (ret == FALSE)
		goto out_cleanup;

	tgsi_parse_init(&parse, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&parse)) {
		const union tgsi_full_token *tok = &parse.FullToken;

		/* don't allow half insn/immd on first and last instruction */
		pc->allow32 = TRUE;
		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
			pc->allow32 = FALSE;

		tgsi_parse_token(&parse);

		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			++pc->insn_cur;
			ret = nv50_tgsi_insn(pc, tok);
			if (ret == FALSE)
				goto out_err;
			break;
		default:
			break;
		}
	}

	if (p->type == PIPE_SHADER_FRAGMENT) {
		/* copy results not computed in their final hw reg (rhw) */
		struct nv50_reg out;
		ctor_reg(&out, P_TEMP, -1, -1);

		for (k = 0; k < pc->result_nr * 4; k++) {
			if (pc->result[k].rhw == -1)
				continue;
			if (pc->result[k].hw != pc->result[k].rhw) {
				out.hw = pc->result[k].rhw;
				emit_mov(pc, &out, &pc->result[k]);
			}
			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
				pc->p->cfg.high_result = pc->result[k].rhw + 1;
		}
	}

	/* look for single half instructions and make them long */
	struct nv50_program_exec *e, *e_prev;

	for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
		if (!is_long(e))
			k++;

		/* end of a run of short insns: pad odd-length runs */
		if (!e->next || is_long(e->next)) {
			if (k & 1)
				convert_to_long(pc, e);
			k = 0;
		}

		if (e->next)
			e_prev = e; /* tracks the second-to-last exec */
	}

	if (!is_long(pc->p->exec_tail)) {
		/* this may occur if moving FP results */
		assert(e_prev && !is_long(e_prev));
		convert_to_long(pc, e_prev);
		convert_to_long(pc, pc->p->exec_tail);
	}

	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
	/* flag the last instruction word (hw end-of-program marker) */
	pc->p->exec_tail->inst[1] |= 0x00000001;

	p->param_nr = pc->param_nr * 4;
	p->immd_nr = pc->immd_nr * 4;
	p->immd = pc->immd_buf; /* immediate data ownership moves to p */

out_err:
	tgsi_parse_free(&parse);

out_cleanup:
	free_nv50_pc(pc);
	return ret;
}
2451
2452static void
2453nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2454{
2455	if (nv50_program_tx(p) == FALSE)
2456		assert(0);
2457	p->translated = TRUE;
2458}
2459
2460static void
2461nv50_program_upload_data(struct nv50_context *nv50, float *map,
2462			unsigned start, unsigned count, unsigned cbuf)
2463{
2464	struct nouveau_channel *chan = nv50->screen->base.channel;
2465	struct nouveau_grobj *tesla = nv50->screen->tesla;
2466
2467	while (count) {
2468		unsigned nr = count > 2047 ? 2047 : count;
2469
2470		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2471		OUT_RING  (chan, (cbuf << 0) | (start << 8));
2472		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2473		OUT_RINGp (chan, map, nr);
2474
2475		map += nr;
2476		start += nr;
2477		count -= nr;
2478	}
2479}
2480
/* Ensure the program's immediates and parameters have space in their
 * constant-buffer heaps (evicting other programs' allocations if needed)
 * and upload the current values.
 */
static void
nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
{
	struct pipe_screen *pscreen = nv50->pipe.screen;

	/* immediates: data[0]; allocated once and kept until evicted */
	if (!p->data[0] && p->immd_nr) {
		struct nouveau_resource *heap = nv50->screen->immd_heap[0];

		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
			/* heap full: free other programs' immediate blocks
			 * until enough space may be available
			 * (NOTE(review): assumes heap->size is the free size
			 * of the head node and heap->next walks allocated
			 * blocks — confirm against nouveau_resource impl) */
			while (heap->next && heap->size < p->immd_nr) {
				struct nv50_program *evict = heap->next->priv;
				nouveau_resource_free(&evict->data[0]);
			}

			/* retry; failure here is fatal in debug builds */
			if (nouveau_resource_alloc(heap, p->immd_nr, p,
						   &p->data[0]))
				assert(0);
		}

		/* immediates only need to be uploaded again when freed */
		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
					 p->immd_nr, NV50_CB_PMISC);
	}

	/* parameters (user constants): data[1]; per-shader-type heap */
	if (!p->data[1] && p->param_nr) {
		struct nouveau_resource *heap =
			nv50->screen->parm_heap[p->type];

		if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
			/* same eviction scheme as for immediates above */
			while (heap->next && heap->size < p->param_nr) {
				struct nv50_program *evict = heap->next->priv;
				nouveau_resource_free(&evict->data[1]);
			}

			if (nouveau_resource_alloc(heap, p->param_nr, p,
						   &p->data[1]))
				assert(0);
		}
	}

	/* parameters are re-uploaded every call, since the bound constant
	 * buffer contents may have changed */
	if (p->param_nr) {
		unsigned cbuf = NV50_CB_PVP;
		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
					     PIPE_BUFFER_USAGE_CPU_READ);
		if (p->type == PIPE_SHADER_FRAGMENT)
			cbuf = NV50_CB_PFP;
		nv50_program_upload_data(nv50, map, p->data[1]->start,
					 p->param_nr, cbuf);
		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
	}
}
2532
2533static void
2534nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2535{
2536	struct nouveau_channel *chan = nv50->screen->base.channel;
2537	struct nouveau_grobj *tesla = nv50->screen->tesla;
2538	struct nv50_program_exec *e;
2539	struct nouveau_stateobj *so;
2540	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2541	unsigned start, count, *up, *ptr;
2542	boolean upload = FALSE;
2543
2544	if (!p->bo) {
2545		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
2546			       p->exec_size * 4, &p->bo);
2547		upload = TRUE;
2548	}
2549
2550	if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
2551		(p->data[1] && p->data[1]->start != p->data_start[1])) {
2552		for (e = p->exec_head; e; e = e->next) {
2553			unsigned ei, ci, bs;
2554
2555			if (e->param.index < 0)
2556				continue;
2557			bs = (e->inst[1] >> 22) & 0x07;
2558			assert(bs < 2);
2559			ei = e->param.shift >> 5;
2560			ci = e->param.index + p->data[bs]->start;
2561
2562			e->inst[ei] &= ~e->param.mask;
2563			e->inst[ei] |= (ci << e->param.shift);
2564		}
2565
2566		if (p->data[0])
2567			p->data_start[0] = p->data[0]->start;
2568		if (p->data[1])
2569			p->data_start[1] = p->data[1]->start;
2570
2571		upload = TRUE;
2572	}
2573
2574	if (!upload)
2575		return;
2576
2577#ifdef NV50_PROGRAM_DUMP
2578	NOUVEAU_ERR("-------\n");
2579	for (e = p->exec_head; e; e = e->next) {
2580		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2581		if (is_long(e))
2582			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2583	}
2584#endif
2585
2586	up = ptr = MALLOC(p->exec_size * 4);
2587	for (e = p->exec_head; e; e = e->next) {
2588		*(ptr++) = e->inst[0];
2589		if (is_long(e))
2590			*(ptr++) = e->inst[1];
2591	}
2592
2593	so = so_new(4,2);
2594	so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
2595	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2596	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2597	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2598
2599	start = 0; count = p->exec_size;
2600	while (count) {
2601		struct nouveau_channel *chan = nv50->screen->base.channel;
2602		unsigned nr;
2603
2604		so_emit(chan, so);
2605
2606		nr = MIN2(count, 2047);
2607		nr = MIN2(chan->pushbuf->remaining, nr);
2608		if (chan->pushbuf->remaining < (nr + 3)) {
2609			FIRE_RING(chan);
2610			continue;
2611		}
2612
2613		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2614		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
2615		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2616		OUT_RINGp (chan, up + start, nr);
2617
2618		start += nr;
2619		count -= nr;
2620	}
2621
2622	FREE(up);
2623	so_ref(NULL, &so);
2624}
2625
/* Validate the bound vertex program: translate it on first use, make its
 * code and constant data resident, and build the stateobj that points the
 * hardware at it.
 */
void
nv50_vertprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->vertprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(13, 2);
	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
	so_data  (so, p->cfg.attr[0]); /* input attribute enable masks */
	so_data  (so, p->cfg.attr[1]);
	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
	so_data  (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
	so_data  (so, p->cfg.high_result); //8);
	so_data  (so, p->cfg.high_temp); /* temp register allocation */
	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.vertprog);
	so_ref(NULL, &so);
}
2661
/* Validate the bound fragment program: translate it on first use, make its
 * code and constant data resident, and build the stateobj that points the
 * hardware at it.
 */
void
nv50_fragprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->fragprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(64, 2);
	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
	so_data  (so, p->cfg.high_result);
	/* regs[2]/[3] are control words filled in during translation;
	 * exact hardware meaning unknown (UNK methods) */
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
	so_data  (so, p->cfg.regs[2]);
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
	so_data  (so, p->cfg.regs[3]);
	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.fragprog);
	so_ref(NULL, &so);
}
2697
2698static int
2699nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
2700	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
2701{
2702	int c;
2703	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
2704	uint8_t *map = (uint8_t *)p_map;
2705
2706	for (c = 0; c < 4; ++c) {
2707		if (mf & 1) {
2708			if (fpi->linear == TRUE)
2709				lin[mid / 32] |= 1 << (mid % 32);
2710			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
2711		}
2712
2713		oid += mv & 1;
2714		mf >>= 1;
2715		mv >>= 1;
2716	}
2717
2718	return mid;
2719}
2720
/* Build the VP result map that routes vertex program outputs to fragment
 * program inputs, plus the semantic/interpolation control words, and emit
 * them as a stateobj.  Must be re-run whenever VP, FP or the two-side
 * rasterizer state changes.
 */
void
nv50_linkage_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *vp = nv50->vertprog;
	struct nv50_program *fp = nv50->fragprog;
	struct nouveau_stateobj *so;
	struct nv50_sreg4 dummy, *vpo;
	int i, n, c, m = 0;
	uint32_t map[16], lin[4], reg[5];

	memset(map, 0, sizeof(map));
	memset(lin, 0, sizeof(lin));

	reg[1] = 0x00000004; /* low and high clip distance map ids */
	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
	reg[3] = 0x00000000; /* point size map id & enable */
	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
	reg[4] = fp->cfg.regs[1]; /* interpolant info */

	/* position (io[0]) always goes first, all four components */
	dummy.linear = FALSE;
	dummy.mask = 0xf; /* map all components of HPOS */
	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);

	/* from here on, dummy stands in for "VP doesn't write this" */
	dummy.mask = 0x0;

	/* clip distances follow position, if the VP emits any */
	if (vp->cfg.clpd < 0x40) {
		for (c = 0; c < vp->cfg.clpd_nr; ++c)
			map[m++] = vp->cfg.clpd + c;
		reg[1] = (m << 8);
	}

	reg[0] |= m << 8; /* adjust BFC0 id */

	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
	if (nv50->rasterizer->pipe.light_twoside) {
		vpo = &vp->cfg.two_side[0];

		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
	}

	reg[0] += m - 4; /* adjust FFC0 id */
	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */

	/* map the remaining FP inputs; skip FP input 0 if it is the
	 * window position (already handled via HPOS above) */
	i = 0;
	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
		i = 1;
	for (; i < fp->cfg.io_nr; i++) {
		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];

		/* use the dummy (zero-mask) output if the VP has no output
		 * with a matching semantic */
		n = fp->cfg.io[i].id_vp;
		if (n >= vp->cfg.io_nr ||
		    vp->info.output_semantic_name[n] != sn ||
		    vp->info.output_semantic_index[n] != si)
			vpo = &dummy;
		else
			vpo = &vp->cfg.io[n];

		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
	}

	/* now fill the stateobj */
	so = so_new(64, 0);

	n = (m + 3) / 4; /* map entries are bytes, methods take dwords */
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
	so_data  (so, m);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
	so_datap (so, map, n);

	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
	so_datap (so, reg, 4);

	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
	so_data  (so, reg[4]);

	/* 0x1540: linear interpolation bitfield, one bit per map entry */
	so_method(so, tesla, 0x1540, 4);
	so_datap (so, lin, 4);

        so_ref(so, &nv50->state.programs);
        so_ref(NULL, &so);
}
2805
2806void
2807nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2808{
2809	while (p->exec_head) {
2810		struct nv50_program_exec *e = p->exec_head;
2811
2812		p->exec_head = e->next;
2813		FREE(e);
2814	}
2815	p->exec_tail = NULL;
2816	p->exec_size = 0;
2817
2818	nouveau_bo_ref(NULL, &p->bo);
2819
2820	nouveau_resource_free(&p->data[0]);
2821	nouveau_resource_free(&p->data[1]);
2822
2823	p->translated = 0;
2824}
2825