nv50_program.c revision d3a9cf54c0a95fb60ac8921e100d51b53c44541b
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
34#define NV50_SU_MAX_TEMP 64
35//#define NV50_PROGRAM_DUMP
36
37/* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * 	- Fuck it off, introduce a way to negate args for ops that
41 * 	  support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
53 * FUCK! watch dst==src vectors, can overwrite components that are needed.
54 * 	ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * 	FP attr/result assignment - how?
58 * 		attrib
59 * 			- 0x16bc maps vp output onto fp hpos
60 * 			- 0x16c0 maps vp output onto fp col0
61 * 		result
62 * 			- colr always 0-3
63 * 			- depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * 	      "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * 	- 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * 	- XX == FP high something
75 */
/* Abstract register: a TGSI-level or compiler-internal value, together
 * with the hw register/slot it gets mapped to during code generation.
 */
struct nv50_reg {
	enum {
		P_TEMP,   /* hw temporary ($rN) */
		P_ATTR,   /* shader input */
		P_RESULT, /* shader output */
		P_CONST,  /* constant buffer entry */
		P_IMMD    /* immediate value (pc->immd_buf) */
	} type;
	int index; /* TGSI register index, -1 for internal temporaries */

	int hw;  /* assigned hw slot, -1 while unallocated */
	int neg; /* read the value negated */

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
92
/* Code-generation context for translating one nv50_program from TGSI:
 * tracks the TGSI register files, hw temporary occupancy and various
 * per-instruction emission state.
 */
struct nv50_pc {
	struct nv50_program *p;

	/* hw resources: r_temp[i] is the reg currently holding hw temp i */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];

	/* tgsi resources (arrays of 4 regs per TGSI vector register) */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	float *immd_buf; /* immd_nr vectors of 4 floats */
	int immd_nr;

	/* scratch temps, released again after each TGSI instruction */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;

	/* broadcast and destination replacement regs */
	struct nv50_reg *r_brdc;
	struct nv50_reg *r_dst[4];

	unsigned interp_mode[32]; /* INTERP_* mode per FP input */
	/* perspective interpolation registers */
	struct nv50_reg *iv_p;
	struct nv50_reg *iv_c;

	/* current instruction and total number of insns */
	unsigned insn_cur;
	unsigned insn_nr;

	boolean allow32; /* allow short (single word) instruction encodings */
};
130
131static INLINE void
132ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
133{
134	reg->type = type;
135	reg->index = index;
136	reg->hw = hw;
137	reg->neg = 0;
138	reg->rhw = -1;
139	reg->acc = 0;
140}
141
/* Return the number of set bits in the low 4 bits of val. */
static inline unsigned
popcnt4(uint32_t val)
{
	unsigned n, bits = val & 0xf;

	/* Kernighan's trick: clear the lowest set bit per iteration */
	for (n = 0; bits; bits &= bits - 1)
		n++;

	return n;
}
149
/* Bind reg to a hw resource.  For results this only bumps the highest
 * used output index; for temporaries without a hw slot it picks a free
 * entry of r_temp[], preferring the index stored in reg->rhw if set.
 */
static void
alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
{
	int i = 0;

	if (reg->type == P_RESULT) {
		if (pc->p->cfg.high_result < (reg->hw + 1))
			pc->p->cfg.high_result = reg->hw + 1;
	}

	/* only P_TEMP needs an actual slot allocation */
	if (reg->type != P_TEMP)
		return;

	if (reg->hw >= 0) {
		/*XXX: do this here too to catch FP temp-as-attr usage..
		 *     not clean, but works */
		if (pc->p->cfg.high_temp < (reg->hw + 1))
			pc->p->cfg.high_temp = reg->hw + 1;
		return;
	}

	if (reg->rhw != -1) {
		/* try to allocate temporary with index rhw first */
		if (!(pc->r_temp[reg->rhw])) {
			pc->r_temp[reg->rhw] = reg;
			reg->hw = reg->rhw;
			if (pc->p->cfg.high_temp < (reg->rhw + 1))
				pc->p->cfg.high_temp = reg->rhw + 1;
			return;
		}
		/* make sure we don't get things like $r0 needs to go
		 * in $r1 and $r1 in $r0
		 */
		i = pc->result_nr * 4;
	}

	/* first-fit search for a free hw temp */
	for (; i < NV50_SU_MAX_TEMP; i++) {
		if (!(pc->r_temp[i])) {
			pc->r_temp[i] = reg;
			reg->hw = i;
			if (pc->p->cfg.high_temp < (i + 1))
				pc->p->cfg.high_temp = i + 1;
			return;
		}
	}

	/* out of hw temps */
	assert(0);
}
198
/* Return a temporary usable as an instruction destination.  If dst is an
 * unallocated P_TEMP it is returned directly; otherwise a fresh anonymous
 * (index == -1) temporary is created and bound to a free hw slot.
 * Anonymous temps must be released with free_temp()/release_hw().
 */
static struct nv50_reg *
alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
{
	struct nv50_reg *r;
	int i;

	/* reuse dst itself if it is a temp still awaiting a hw slot */
	if (dst && dst->type == P_TEMP && dst->hw == -1)
		return dst;

	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
		if (!pc->r_temp[i]) {
			r = MALLOC_STRUCT(nv50_reg);
			ctor_reg(r, P_TEMP, -1, i);
			pc->r_temp[i] = r;
			return r;
		}
	}

	/* out of hw temps */
	assert(0);
	return NULL;
}
220
/* Assign the hw of the discarded temporary register src
 * to the tgsi register dst and free src.
 * dst gives up any hw slot it previously occupied.
 */
static void
assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	/* src must be an anonymous temp that already has a hw slot */
	assert(src->index == -1 && src->hw != -1);

	if (dst->hw != -1)
		pc->r_temp[dst->hw] = NULL;
	pc->r_temp[src->hw] = dst;
	dst->hw = src->hw;

	FREE(src);
}
236
/* release the hardware resource held by r; anonymous temporaries
 * (index == -1) are freed entirely
 */
static void
release_hw(struct nv50_pc *pc, struct nv50_reg *r)
{
	assert(r->type == P_TEMP);
	if (r->hw == -1)
		return;

	/* r_temp[] must still point back at r, else bookkeeping is broken */
	assert(pc->r_temp[r->hw] == r);
	pc->r_temp[r->hw] = NULL;

	r->acc = 0;
	if (r->index == -1)
		FREE(r);
}
252
253static void
254free_temp(struct nv50_pc *pc, struct nv50_reg *r)
255{
256	if (r->index == -1) {
257		unsigned hw = r->hw;
258
259		FREE(pc->r_temp[hw]);
260		pc->r_temp[hw] = NULL;
261	}
262}
263
264static int
265alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
266{
267	int i;
268
269	if ((idx + 4) >= NV50_SU_MAX_TEMP)
270		return 1;
271
272	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
273	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
274		return alloc_temp4(pc, dst, idx + 4);
275
276	for (i = 0; i < 4; i++) {
277		dst[i] = MALLOC_STRUCT(nv50_reg);
278		ctor_reg(dst[i], P_TEMP, -1, idx + i);
279		pc->r_temp[idx + i] = dst[i];
280	}
281
282	return 0;
283}
284
/* Release all four registers of a quad obtained via alloc_temp4(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	int c;

	for (c = 3; c >= 0; c--)
		free_temp(pc, reg[c]);
}
293
294static struct nv50_reg *
295temp_temp(struct nv50_pc *pc)
296{
297	if (pc->temp_temp_nr >= 16)
298		assert(0);
299
300	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
301	return pc->temp_temp[pc->temp_temp_nr++];
302}
303
304static void
305kill_temp_temp(struct nv50_pc *pc)
306{
307	int i;
308
309	for (i = 0; i < pc->temp_temp_nr; i++)
310		free_temp(pc, pc->temp_temp[i]);
311	pc->temp_temp_nr = 0;
312}
313
/* Append the immediate vector (x, y, z, w) to the immediate buffer and
 * return its vector index.
 * NOTE(review): the REALLOC result is not checked; an allocation failure
 * would crash here - consistent with allocation handling elsewhere in
 * this file, but worth confirming against Mesa's OOM policy.
 */
static int
ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
{
	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
			       (pc->immd_nr + 1) * 4 * sizeof(float));
	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;

	return pc->immd_nr++;
}
326
/* Return a fresh nv50_reg referencing the immediate value f, reusing an
 * existing buffer slot if f is already stored.  New values are added as
 * the vector (f, -f, 0.5*f, 0) so related constants come for free.
 * The caller owns the returned reg and must FREE it.
 */
static struct nv50_reg *
alloc_immd(struct nv50_pc *pc, float f)
{
	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
	unsigned hw;

	/* linear search over all stored immediate scalars */
	for (hw = 0; hw < pc->immd_nr * 4; hw++)
		if (pc->immd_buf[hw] == f)
			break;

	if (hw == pc->immd_nr * 4)
		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;

	ctor_reg(r, P_IMMD, -1, hw);
	return r;
}
343
344static struct nv50_program_exec *
345exec(struct nv50_pc *pc)
346{
347	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
348
349	e->param.index = -1;
350	return e;
351}
352
353static void
354emit(struct nv50_pc *pc, struct nv50_program_exec *e)
355{
356	struct nv50_program *p = pc->p;
357
358	if (p->exec_tail)
359		p->exec_tail->next = e;
360	if (!p->exec_head)
361		p->exec_head = e;
362	p->exec_tail = e;
363	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
364}
365
366static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
367
368static boolean
369is_long(struct nv50_program_exec *e)
370{
371	if (e->inst[0] & 1)
372		return TRUE;
373	return FALSE;
374}
375
376static boolean
377is_immd(struct nv50_program_exec *e)
378{
379	if (is_long(e) && (e->inst[1] & 3) == 3)
380		return TRUE;
381	return FALSE;
382}
383
/* Predicate execution of e on condition code pred of predicate register
 * idx (inst[1] bits 7-11 hold the condition, bits 12-13 the register;
 * 0xf means "always").  Forces the long encoding.
 */
static INLINE void
set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
	 struct nv50_program_exec *e)
{
	set_long(pc, e);
	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
	e->inst[1] |= (pred << 7) | (idx << 12);
}
392
/* Enable (on != 0) or disable writing e's condition flags to predicate
 * register idx (inst[1] bits 4-5 select the register, bit 6 the write
 * enable).  Forces the long encoding.
 */
static INLINE void
set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
	    struct nv50_program_exec *e)
{
	set_long(pc, e);
	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
	e->inst[1] |= (idx << 4) | (on << 6);
}
401
/* Switch e to the two-word encoding.  The long form carries predicate
 * fields, so they are initialized to "always execute, no flag write".
 */
static INLINE void
set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	if (is_long(e))
		return;

	e->inst[0] |= 1;
	set_pred(pc, 0xf, 0, e);
	set_pred_wr(pc, 0, 0, e);
}
412
/* Encode dst as the destination of e (inst[0] bits 2..8), allocating a
 * hw slot for it if needed.  Shader outputs require the long form plus
 * the output flag in inst[1].
 */
static INLINE void
set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
{
	if (dst->type == P_RESULT) {
		set_long(pc, e);
		e->inst[1] |= 0x00000008;
	}

	alloc_reg(pc, dst);
	e->inst[0] |= (dst->hw << 2);
}
424
/* Inline the immediate value referenced by imm into e, honouring
 * imm->neg.  The float's bit pattern is split across inst[0] (low 6
 * bits at bit 16) and inst[1]; inst[1] low bits 3 mark the
 * immediate-form instruction.
 */
static INLINE void
set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
{
	float f = pc->immd_buf[imm->hw];
	unsigned val = fui(imm->neg ? -f : f);

	set_long(pc, e);
	/*XXX: can't be predicated - bits overlap.. catch cases where both
	 *     are required and avoid them. */
	set_pred(pc, 0, 0, e);
	set_pred_wr(pc, 0, 0, e);

	e->inst[1] |= 0x00000002 | 0x00000001;
	e->inst[0] |= (val & 0x3f) << 16;
	e->inst[1] |= (val >> 6) << 2;
}
441
442
443#define INTERP_LINEAR		0
444#define INTERP_FLAT			1
445#define INTERP_PERSPECTIVE	2
446#define INTERP_CENTROID		4
447
448/* interpolant index has been stored in dst->rhw */
449static void
450emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
451		unsigned mode)
452{
453	assert(dst->rhw != -1);
454	struct nv50_program_exec *e = exec(pc);
455
456	e->inst[0] |= 0x80000000;
457	set_dst(pc, dst, e);
458	e->inst[0] |= (dst->rhw << 16);
459
460	if (mode & INTERP_FLAT) {
461		e->inst[0] |= (1 << 8);
462	} else {
463		if (mode & INTERP_PERSPECTIVE) {
464			e->inst[0] |= (1 << 25);
465			alloc_reg(pc, iv);
466			e->inst[0] |= (iv->hw << 9);
467		}
468
469		if (mode & INTERP_CENTROID)
470			e->inst[0] |= (1 << 24);
471	}
472
473	emit(pc, e);
474}
475
/* Attach a const-buffer/immediate data reference to e: record which
 * parameter slot (src->hw) must be patched into the opcode at upload
 * time, at bit offset s with mask m.  inst[1] bit 22 distinguishes
 * const (1) from immediate (0) data.
 */
static void
set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
	 struct nv50_program_exec *e)
{
	set_long(pc, e);

	e->param.index = src->hw;
	e->param.shift = s;
	e->param.mask = m << (s % 32);

	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
}
488
/* Emit a MOV from src to dst, picking between the inlined-immediate,
 * const/attr and plain register forms; prefers the short encoding when
 * pc->allow32 permits and dst is not a shader output.
 */
static void
emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x10000000;

	set_dst(pc, dst, e);

	if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
		set_immd(pc, src, e);
		/*XXX: 32-bit, but steals part of "half" reg space - need to
		 *     catch and handle this case if/when we do half-regs
		 */
	} else
	if (src->type == P_IMMD || src->type == P_CONST) {
		set_long(pc, e);
		set_data(pc, src, 0x7f, 9, e);
		e->inst[1] |= 0x20000000; /* src0 const? */
	} else {
		if (src->type == P_ATTR) {
			set_long(pc, e);
			e->inst[1] |= 0x00200000;
		}

		alloc_reg(pc, src);
		e->inst[0] |= (src->hw << 9);
	}

	if (is_long(e) && !is_immd(e)) {
		e->inst[1] |= 0x04000000; /* 32-bit */
		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
		if (!(e->inst[1] & 0x20000000))
			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
	} else
		e->inst[0] |= 0x00008000;

	emit(pc, e);
}
528
529static INLINE void
530emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
531{
532	struct nv50_reg *imm = alloc_immd(pc, f);
533	emit_mov(pc, dst, imm);
534	FREE(imm);
535}
536
537static boolean
538check_swap_src_0_1(struct nv50_pc *pc,
539		   struct nv50_reg **s0, struct nv50_reg **s1)
540{
541	struct nv50_reg *src0 = *s0, *src1 = *s1;
542
543	if (src0->type == P_CONST) {
544		if (src1->type != P_CONST) {
545			*s0 = src1;
546			*s1 = src0;
547			return TRUE;
548		}
549	} else
550	if (src1->type == P_ATTR) {
551		if (src0->type != P_ATTR) {
552			*s0 = src1;
553			*s1 = src0;
554			return TRUE;
555		}
556	}
557
558	return FALSE;
559}
560
/* Encode src as operand 0 (inst[0] bits 9..15).  Attributes set the
 * long-form attr bit; const/immediate sources are first loaded into a
 * scratch temp since slot 0 does not take a data reference here.
 */
static void
set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		set_long(pc, e);
		e->inst[1] |= 0x00200000;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	}

	alloc_reg(pc, src);
	e->inst[0] |= (src->hw << 9);
}
578
/* Encode src as operand 1 (inst[0] bits 16..22).  Const/immediate data
 * may be referenced directly via set_data() unless operand 2 already
 * holds a data reference (flag 0x01000000), in which case - like
 * attributes always - it is staged through a scratch temp.
 */
static void
set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		/* only one of src1/src2 can carry a data reference */
		assert(!(e->inst[0] & 0x00800000));
		if (e->inst[0] & 0x01000000) {
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 16, e);
			e->inst[0] |= 0x00800000;
		}
	}

	alloc_reg(pc, src);
	e->inst[0] |= (src->hw << 16);
}
604
/* Encode src as operand 2 (inst[1] bits 14..20, long form only).
 * Mirror of set_src_1: const/immediate data may be attached via
 * set_data() only if operand 1 (flag 0x00800000) has not already done
 * so; attributes always go through a scratch temp.
 */
static void
set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	set_long(pc, e);

	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		/* only one of src1/src2 can carry a data reference */
		assert(!(e->inst[0] & 0x01000000));
		if (e->inst[0] & 0x00800000) {
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 32+14, e);
			e->inst[0] |= 0x01000000;
		}
	}

	alloc_reg(pc, src);
	e->inst[1] |= (src->hw << 14);
}
632
/* Emit dst = src0 * src1.  Source negation is folded into the opcode
 * (different bit in long vs short form); an immediate src1 can be
 * inlined when the instruction is still short.
 */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xc0000000;

	if (!pc->allow32)
		set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_IMMD && !is_long(e)) {
		if (src0->neg)
			e->inst[0] |= 0x00008000;
		set_immd(pc, src1, e);
	} else {
		set_src_1(pc, src1, e);
		/* negation cancels out when both sources are negated */
		if (src0->neg ^ src1->neg) {
			if (is_long(e))
				e->inst[1] |= 0x08000000;
			else
				e->inst[0] |= 0x00008000;
		}
	}

	emit(pc, e);
}
663
/* Emit dst = src0 + src1.  Source negation needs the long form (bits 26
 * and 27 of inst[1]); const/attr src1 goes into operand slot 2, an
 * immediate src1 may be inlined into a short encoding.
 */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	check_swap_src_0_1(pc, &src0, &src1);

	if (!pc->allow32 || src0->neg || src1->neg) {
		set_long(pc, e);
		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
	}

	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
		set_src_2(pc, src1, e);
	else
	if (src1->type == P_IMMD)
		set_immd(pc, src1, e);
	else
		set_src_1(pc, src1, e);

	emit(pc, e);
}
691
/* Emit a MIN/MAX of src0, src1 into dst; the subop goes into inst[1]
 * bits 29+ (callers in this file pass sub == 4 for MAX and 5 for MIN).
 * Always uses the long form.
 */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= (sub << 29);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	emit(pc, e);
}
709
/* dst = src0 - src1, as an ADD with src1 temporarily negated; src1->neg
 * is toggled back afterwards so the caller's register is unchanged.
 */
static INLINE void
emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	src1->neg ^= 1;
	emit_add(pc, dst, src0, src1);
	src1->neg ^= 1;
}
718
/* Emit dst = src0 * src1 + src2 (MAD).  Negation of the product and of
 * src2 is folded into inst[1] bits 26 and 27 respectively.
 */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xe0000000;

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);
	set_src_2(pc, src2, e);

	if (src0->neg ^ src1->neg)
		e->inst[1] |= 0x04000000;
	if (src2->neg)
		e->inst[1] |= 0x08000000;

	emit(pc, e);
}
740
/* dst = src0 * src1 - src2: a MAD with src2 temporarily negated
 * (restored before returning).
 */
static INLINE void
emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	src2->neg ^= 1;
	emit_mad(pc, dst, src0, src1, src2);
	src2->neg ^= 1;
}
749
/* Emit a scalar "flop" whose subop is selected by sub; callers in this
 * file use 0 for RCP (which keeps the short encoding) and, via
 * emit_pow, 3 and 6 for LG2/EX2 - other subops force the long form.
 */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x90000000;
	if (sub) {
		set_long(pc, e);
		e->inst[1] |= (sub << 29);
	}

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);

	emit(pc, e);
}
767
/* Emit the EX2 pre-op computing dst from src (argument reduction that
 * precedes emit_flop's exp2, cf. emit_pow).
 */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29) | 0x00004000;

	emit(pc, e);
}
782
/* Emit the SIN/COS pre-op (argument reduction) computing dst from src. */
static void
emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29);

	emit(pc, e);
}
797
798#define CVTOP_RN	0x01
799#define CVTOP_FLOOR	0x03
800#define CVTOP_CEIL	0x05
801#define CVTOP_TRUNC	0x07
802#define CVTOP_SAT	0x08
803#define CVTOP_ABS	0x10
804
805/* 0x04 == 32 bit */
806/* 0x40 == dst is float */
807/* 0x80 == src is float */
808#define CVT_F32_F32 0xc4
809#define CVT_F32_S32 0x44
810#define CVT_F32_U32 0x64
811#define CVT_S32_F32 0x8c
812#define CVT_S32_S32 0x0c
813#define CVT_F32_F32_ROP 0xcc
814
/* Emit a CVT converting src per fmt (CVT_*), applying the rounding/
 * saturation/abs operation cvn (CVTOP_*).  If wp >= 0 the condition
 * flags are also written to predicate register wp.  A NULL dst sends
 * the result to the bit bucket (dst encoding 0x1fc, i.e. $r127, plus
 * the output flag).
 */
static void
emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 int wp, unsigned cvn, unsigned fmt)
{
	struct nv50_program_exec *e;

	e = exec(pc);
	set_long(pc, e);

	e->inst[0] |= 0xa0000000;
	e->inst[1] |= 0x00004000;
	e->inst[1] |= (cvn << 16);
	e->inst[1] |= (fmt << 24);
	set_src_0(pc, src, e);

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);

	if (dst)
		set_dst(pc, dst, e);
	else {
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	emit(pc, e);
}
842
843/* nv50 Condition codes:
844 *  0x1 = LT
845 *  0x2 = EQ
846 *  0x3 = LE
847 *  0x4 = GT
848 *  0x5 = NE
849 *  0x6 = GE
850 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
851 *  0x8 = unordered bit (allows NaN)
852 */
/* Emit a SET comparing src0 <ccode> src1 (condition codes listed above)
 * and write the boolean result, converted to float, into rdst and/or
 * the flags into predicate register wp (if >= 0).  A non-temp dst is
 * staged through a fresh temporary.
 */
static void
emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	/* condition with LT<->GT and LE<->GE exchanged, applied when the
	 * operands get swapped below; the unordered bit (8) is kept */
	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };

	struct nv50_program_exec *e = exec(pc);
	struct nv50_reg *rdst;

	assert(ccode < 16);
	if (check_swap_src_0_1(pc, &src0, &src1))
		ccode = cc_swapped[ccode & 7] | (ccode & 8);

	rdst = dst;
	if (dst && dst->type != P_TEMP)
		dst = alloc_temp(pc, NULL);

	/* set.u32 */
	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= 0x60000000 | (ccode << 14);

	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
	 * that doesn't seem to match what the hw actually does
	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
	 */

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);
	if (dst)
		set_dst(pc, dst, e);
	else {
		/* predicate-only: write to the bit bucket */
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	emit(pc, e);

	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
	if (rdst)
		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
	if (rdst && rdst != dst)
		free_temp(pc, dst);
}
900
901static INLINE unsigned
902map_tgsi_setop_cc(unsigned op)
903{
904	switch (op) {
905	case TGSI_OPCODE_SLT: return 0x1;
906	case TGSI_OPCODE_SGE: return 0x6;
907	case TGSI_OPCODE_SEQ: return 0x2;
908	case TGSI_OPCODE_SGT: return 0x4;
909	case TGSI_OPCODE_SLE: return 0x3;
910	case TGSI_OPCODE_SNE: return 0xd;
911	default:
912		assert(0);
913		return 0;
914	}
915}
916
/* dst = floor(src), as a rounding CVT. */
static INLINE void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
}
922
/* dst = v ^ e, computed as 2^(e * log2(v)) through a scratch temp:
 * flop subop 3 is the log2, subop 6 the exp2 (with its required
 * pre-op in between).
 */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *temp = alloc_temp(pc, NULL);

	emit_flop(pc, 3, temp, v);
	emit_mul(pc, temp, temp, e);
	emit_preex2(pc, temp, temp);
	emit_flop(pc, 6, dst, temp);

	free_temp(pc, temp);
}
936
/* dst = |src|, via CVT. */
static INLINE void
emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
}
942
/* dst = clamp(src, 0.0, 1.0), via saturating CVT. */
static INLINE void
emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
}
948
/* Emit the LIT sequence into dst[0..3] for the components in mask:
 *   dst.x = 1.0, dst.w = 1.0, dst.y = max(src.x, 0),
 *   dst.z = (src.x > 0) ? max(src.y, 0)^clamp(src.w, +-128) : 0
 * Predicate register 0 carries the src.x comparison across the pow.
 */
static void
emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0);
	struct nv50_reg *zero = alloc_immd(pc, 0.0);
	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
	struct nv50_reg *tmp[4];
	boolean allow32 = pc->allow32;

	pc->allow32 = FALSE;

	/* y or z requested: both need max(src.x, 0) */
	if (mask & (3 << 1)) {
		tmp[0] = alloc_temp(pc, NULL);
		emit_minmax(pc, 4, tmp[0], src[0], zero);
	}

	if (mask & (1 << 2)) {
		/* let the max above also write its flags to predicate 0 */
		set_pred_wr(pc, 1, 0, pc->p->exec_tail);

		tmp[1] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[1], src[1], zero);

		/* clamp the exponent to +-128 */
		tmp[3] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[3], src[3], neg128);
		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);

		emit_pow(pc, dst[2], tmp[1], tmp[3]);
		/* overwrite with zero where src.x <= 0 (cc 3 == LE) */
		emit_mov(pc, dst[2], zero);
		set_pred(pc, 3, 0, pc->p->exec_tail);
	}

	if (mask & (1 << 1))
		assimilate_temp(pc, dst[1], tmp[0]);
	else
	if (mask & (1 << 2))
		free_temp(pc, tmp[0]);

	pc->allow32 = allow32;

	/* do this last, in case src[i,j] == dst[0,3] */
	if (mask & (1 << 0))
		emit_mov(pc, dst[0], one);

	if (mask & (1 << 3))
		emit_mov(pc, dst[3], one);

	FREE(pos128);
	FREE(neg128);
	FREE(zero);
	FREE(one);
}
1002
/* dst = -src, emitted as a CVT variant (the bits marked "delta" were
 * derived by diffing against known encodings).
 */
static void
emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[0] |= 0xa0000000; /* delta */
	e->inst[1] |= (7 << 29); /* delta */
	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
	e->inst[1] |= (1 << 14); /* src .f32 */
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);

	emit(pc, e);
}
1018
/* Emit KIL: compare src into predicate register 1, then discard the
 * fragment where the predicate test (cc 1, presumably LT) passes.
 * src->neg sets bit 29 of the compare - assumed to negate the source;
 * TODO confirm against the hw docs.
 */
static void
emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
{
	struct nv50_program_exec *e;
	const int r_pred = 1;

	/* Sets predicate reg ? */
	e = exec(pc);
	e->inst[0] = 0xa00001fd;
	e->inst[1] = 0xc4014788;
	set_src_0(pc, src, e);
	set_pred_wr(pc, 1, r_pred, e);
	if (src->neg)
		e->inst[1] |= 0x20000000;
	emit(pc, e);

	/* This is probably KILP */
	e = exec(pc);
	e->inst[0] = 0x000001fe;
	set_long(pc, e);
	set_pred(pc, 1 /* LT? */, r_pred, e);
	emit(pc, e);
}
1042
1043static void
1044emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1045	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
1046{
1047	struct nv50_reg *temp, *t[4];
1048	struct nv50_program_exec *e;
1049
1050	unsigned c, mode, dim;
1051
1052	switch (type) {
1053	case TGSI_TEXTURE_1D:
1054		dim = 1;
1055		break;
1056	case TGSI_TEXTURE_UNKNOWN:
1057	case TGSI_TEXTURE_2D:
1058	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1059	case TGSI_TEXTURE_RECT:
1060		dim = 2;
1061		break;
1062	case TGSI_TEXTURE_3D:
1063	case TGSI_TEXTURE_CUBE:
1064	case TGSI_TEXTURE_SHADOW2D:
1065	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1066		dim = 3;
1067		break;
1068	default:
1069		assert(0);
1070		break;
1071	}
1072
1073	/* some cards need t[0]'s hw index to be a multiple of 4 */
1074	alloc_temp4(pc, t, 0);
1075
1076	if (proj) {
1077		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1078			mode = pc->interp_mode[src[0]->index];
1079
1080			t[3]->rhw = src[3]->rhw;
1081			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1082			emit_flop(pc, 0, t[3], t[3]);
1083
1084			for (c = 0; c < dim; c++) {
1085				t[c]->rhw = src[c]->rhw;
1086				emit_interp(pc, t[c], t[3],
1087					    (mode | INTERP_PERSPECTIVE));
1088			}
1089		} else {
1090			emit_flop(pc, 0, t[3], src[3]);
1091			for (c = 0; c < dim; c++)
1092				emit_mul(pc, t[c], src[c], t[3]);
1093
1094			/* XXX: for some reason the blob sometimes uses MAD:
1095			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1096			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1097			 */
1098		}
1099	} else {
1100		if (type == TGSI_TEXTURE_CUBE) {
1101			temp = temp_temp(pc);
1102			emit_minmax(pc, 4, temp, src[0], src[1]);
1103			emit_minmax(pc, 4, temp, temp, src[2]);
1104			emit_flop(pc, 0, temp, temp);
1105			for (c = 0; c < 3; c++)
1106				emit_mul(pc, t[c], src[c], temp);
1107		} else {
1108			for (c = 0; c < dim; c++)
1109				emit_mov(pc, t[c], src[c]);
1110		}
1111	}
1112
1113	e = exec(pc);
1114	set_long(pc, e);
1115	e->inst[0] |= 0xf0000000;
1116	e->inst[1] |= 0x00000004;
1117	set_dst(pc, t[0], e);
1118	e->inst[0] |= (unit << 9);
1119
1120	if (dim == 2)
1121		e->inst[0] |= 0x00400000;
1122	else
1123	if (dim == 3)
1124		e->inst[0] |= 0x00800000;
1125
1126	e->inst[0] |= (mask & 0x3) << 25;
1127	e->inst[1] |= (mask & 0xc) << 12;
1128
1129	emit(pc, e);
1130
1131#if 1
1132	if (mask & 1) emit_mov(pc, dst[0], t[0]);
1133	if (mask & 2) emit_mov(pc, dst[1], t[1]);
1134	if (mask & 4) emit_mov(pc, dst[2], t[2]);
1135	if (mask & 8) emit_mov(pc, dst[3], t[3]);
1136
1137	free_temp4(pc, t);
1138#else
1139	/* XXX: if p.e. MUL is used directly after TEX, it would still use
1140	 * the texture coordinates, not the fetched values: latency ? */
1141
1142	for (c = 0; c < 4; c++) {
1143		if (mask & (1 << c))
1144			assimilate_temp(pc, dst[c], t[c]);
1145		else
1146			free_temp(pc, t[c]);
1147	}
1148#endif
1149}
1150
/* Convert the short instruction e to the long encoding in place,
 * migrating the opcode-specific fields that sit at different bit
 * positions in the two forms: m masks the inst[0] bits to keep,
 * q holds the bits to set in inst[1].
 */
static void
convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	unsigned q = 0, m = ~0;

	assert(!is_long(e));

	switch (e->inst[0] >> 28) {
	case 0x1:
		/* MOV */
		q = 0x0403c000;
		m = 0xffff7fff;
		break;
	case 0x8:
		/* INTERP (move centroid, perspective and flat bits) */
		m = ~0x03000100;
		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
		break;
	case 0x9:
		/* RCP */
		break;
	case 0xB:
		/* ADD */
		m = ~(127 << 16);
		q = ((e->inst[0] & (~m)) >> 2);
		break;
	case 0xC:
		/* MUL */
		m = ~0x00008000;
		q = ((e->inst[0] & (~m)) << 12);
		break;
	case 0xE:
		/* MAD (if src2 == dst) */
		q = ((e->inst[0] & 0x1fc) << 12);
		break;
	default:
		assert(0);
		break;
	}

	set_long(pc, e);
	/* set_long() does not know the exec list contains e already */
	pc->p->exec_size++;

	e->inst[0] &= m;
	e->inst[1] |= q;
}
1198
1199static boolean
1200negate_supported(const struct tgsi_full_instruction *insn, int i)
1201{
1202	switch (insn->Instruction.Opcode) {
1203	case TGSI_OPCODE_DP3:
1204	case TGSI_OPCODE_DP4:
1205	case TGSI_OPCODE_MUL:
1206	case TGSI_OPCODE_KIL:
1207	case TGSI_OPCODE_ADD:
1208	case TGSI_OPCODE_SUB:
1209	case TGSI_OPCODE_MAD:
1210		return TRUE;
1211	case TGSI_OPCODE_POW:
1212		return (i == 1) ? TRUE : FALSE;
1213	default:
1214		return FALSE;
1215	}
1216}
1217
/* Return a read mask for source registers deduced from opcode & write mask.
 * c is the source operand index (used by opcodes whose sources have
 * different component dependencies, e.g. DST).
 */
static unsigned
nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
{
	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;

	switch (insn->Instruction.Opcode) {
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_SIN:
		/* src.x feeds the xyz broadcast, src.w feeds w */
		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
	case TGSI_OPCODE_DP3:
		return 0x7;
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_DPH:
	case TGSI_OPCODE_KIL: /* WriteMask ignored */
		return 0xf;
	case TGSI_OPCODE_DST:
		/* src0 contributes y and z, src1 contributes y and w */
		return mask & (c ? 0xa : 0x6);
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SCS:
		return 0x1;
	case TGSI_OPCODE_LIT:
		return 0xb;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXP:
	{
		const struct tgsi_instruction_ext_texture *tex;

		assert(insn->Instruction.Extended);
		tex = &insn->InstructionExtTexture;

		mask = 0x7;
		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
			mask |= 0x8; /* projective: also reads w */

		switch (tex->Texture) {
		case TGSI_TEXTURE_1D:
			mask &= 0x9;
			break;
		case TGSI_TEXTURE_2D:
			mask &= 0xb;
			break;
		default:
			break;
		}
	}
		return mask;
	case TGSI_OPCODE_XPD:
		/* cross product: each result channel reads the other two */
		x = 0;
		if (mask & 1) x |= 0x6;
		if (mask & 2) x |= 0x5;
		if (mask & 4) x |= 0x3;
		return x;
	default:
		break;
	}

	return mask;
}
1281
1282static struct nv50_reg *
1283tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1284{
1285	switch (dst->DstRegister.File) {
1286	case TGSI_FILE_TEMPORARY:
1287		return &pc->temp[dst->DstRegister.Index * 4 + c];
1288	case TGSI_FILE_OUTPUT:
1289		return &pc->result[dst->DstRegister.Index * 4 + c];
1290	case TGSI_FILE_NULL:
1291		return NULL;
1292	default:
1293		break;
1294	}
1295
1296	return NULL;
1297}
1298
/* Resolve channel chan of a TGSI source operand to an nv50 register,
 * applying the extended swizzle and the per-channel sign mode.
 *
 * neg: TRUE if the consuming op can negate its sources itself; then a
 * sign change is recorded on the register (r->neg) instead of emitting
 * an extra instruction into a temporary.
 * May return a freshly allocated immediate (freed by the caller when
 * index == -1) or a temp from temp_temp (freed via kill_temp_temp).
 */
static struct nv50_reg *
tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
	 boolean neg)
{
	struct nv50_reg *r = NULL;
	struct nv50_reg *temp;
	unsigned sgn, c;

	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);

	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
	switch (c) {
	case TGSI_EXTSWIZZLE_X:
	case TGSI_EXTSWIZZLE_Y:
	case TGSI_EXTSWIZZLE_Z:
	case TGSI_EXTSWIZZLE_W:
		/* plain swizzle: index into the register array of the file */
		switch (src->SrcRegister.File) {
		case TGSI_FILE_INPUT:
			r = &pc->attr[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_TEMPORARY:
			r = &pc->temp[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_CONSTANT:
			r = &pc->param[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_IMMEDIATE:
			r = &pc->immd[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_SAMPLER:
			/* samplers are handled by the caller (texture unit) */
			break;
		default:
			assert(0);
			break;
		}
		break;
	case TGSI_EXTSWIZZLE_ZERO:
		r = alloc_immd(pc, 0.0);
		return r;
	case TGSI_EXTSWIZZLE_ONE:
		/* fold the sign mode directly into the immediate */
		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
			return alloc_immd(pc, -1.0);
		return alloc_immd(pc, 1.0);
	default:
		assert(0);
		break;
	}

	switch (sgn) {
	case TGSI_UTIL_SIGN_KEEP:
		break;
	case TGSI_UTIL_SIGN_CLEAR:
		/* absolute value: always goes through a temporary */
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		r = temp;
		break;
	case TGSI_UTIL_SIGN_TOGGLE:
		if (neg)
			r->neg = 1; /* op supports source negation itself */
		else {
			temp = temp_temp(pc);
			emit_neg(pc, temp, r);
			r = temp;
		}
		break;
	case TGSI_UTIL_SIGN_SET:
		/* -|x|: take abs first, then negate (or flag negation) */
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		if (neg)
			temp->neg = 1;
		else
			emit_neg(pc, temp, temp);
		r = temp;
		break;
	default:
		assert(0);
		break;
	}

	return r;
}
1380
1381/* return TRUE for ops that produce only a single result */
1382static boolean
1383is_scalar_op(unsigned op)
1384{
1385	switch (op) {
1386	case TGSI_OPCODE_COS:
1387	case TGSI_OPCODE_DP2:
1388	case TGSI_OPCODE_DP3:
1389	case TGSI_OPCODE_DP4:
1390	case TGSI_OPCODE_DPH:
1391	case TGSI_OPCODE_EX2:
1392	case TGSI_OPCODE_LG2:
1393	case TGSI_OPCODE_POW:
1394	case TGSI_OPCODE_RCP:
1395	case TGSI_OPCODE_RSQ:
1396	case TGSI_OPCODE_SIN:
1397		/*
1398	case TGSI_OPCODE_KIL:
1399	case TGSI_OPCODE_LIT:
1400	case TGSI_OPCODE_SCS:
1401		*/
1402		return TRUE;
1403	default:
1404		return FALSE;
1405	}
1406}
1407
1408/* Returns a bitmask indicating which dst components depend
1409 * on source s, component c (reverse of nv50_tgsi_src_mask).
1410 */
1411static unsigned
1412nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1413{
1414	if (is_scalar_op(op))
1415		return 0x1;
1416
1417	switch (op) {
1418	case TGSI_OPCODE_DST:
1419		return (1 << c) & (s ? 0xa : 0x6);
1420	case TGSI_OPCODE_XPD:
1421		switch (c) {
1422		case 0: return 0x6;
1423		case 1: return 0x5;
1424		case 2: return 0x3;
1425		case 3: return 0x0;
1426		default:
1427			assert(0);
1428			return 0x0;
1429		}
1430	case TGSI_OPCODE_LIT:
1431	case TGSI_OPCODE_SCS:
1432	case TGSI_OPCODE_TEX:
1433	case TGSI_OPCODE_TXP:
1434		/* these take care of dangerous swizzles themselves */
1435		return 0x0;
1436	case TGSI_OPCODE_IF:
1437	case TGSI_OPCODE_KIL:
1438		/* don't call this function for these ops */
1439		assert(0);
1440		return 0;
1441	default:
1442		/* linear vector instruction */
1443		return (1 << c);
1444	}
1445}
1446
/* Translate one TGSI instruction into nv50 code.
 *
 * Honors pc->r_dst[c] (forced per-component destination, set by
 * nv50_tgsi_insn for writes that would clobber a needed source) and
 * pc->r_brdc (shared destination for scalar ops, or NULL).
 * Returns FALSE on an unhandled opcode.
 */
static boolean
nv50_program_tx_insn(struct nv50_pc *pc,
		     const struct tgsi_full_instruction *inst)
{
	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
	unsigned mask, sat, unit;
	int i, c;

	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;

	memset(src, 0, sizeof(src));

	/* resolve destinations; a forced dst (r_dst) overrides TGSI's */
	for (c = 0; c < 4; c++) {
		if ((mask & (1 << c)) && !pc->r_dst[c])
			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
		else
			dst[c] = pc->r_dst[c];
		rdst[c] = dst[c];
	}

	/* fetch only the source channels this opcode actually reads */
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
		unsigned src_mask;
		boolean neg_supp;

		src_mask = nv50_tgsi_src_mask(inst, i);
		neg_supp = negate_supported(inst, i);

		/* remember the texture unit for TEX/TXP below */
		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
			unit = fs->SrcRegister.Index;

		for (c = 0; c < 4; c++)
			if (src_mask & (1 << c))
				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
	}

	/* scalar ops compute into temp, then broadcast from brdc; if brdc
	 * isn't a TEMP (or we must saturate), detour through a temporary
	 */
	brdc = temp = pc->r_brdc;
	if (brdc && brdc->type != P_TEMP) {
		temp = temp_temp(pc);
		if (sat)
			brdc = temp;
	} else
	if (sat) {
		/* vector op with saturation: route non-TEMP dsts through
		 * temps, saturate into the real dst (rdst) afterwards
		 */
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
				continue;
			rdst[c] = dst[c];
			dst[c] = temp_temp(pc);
		}
	}

	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));

	/* emit_flop subopcodes used below: 0=RCP, 2=RSQ, 3=LG2, 4=SIN,
	 * 5=COS, 6=EX2
	 */
	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ABS:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_abs(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_ADD:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_add(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_CEIL:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_cvt(pc, dst[c], src[0][c], -1,
				 CVTOP_CEIL, CVT_F32_F32);
		}
		break;
	case TGSI_OPCODE_COS:
		/* w uses src.w, the rest broadcast cos(src.x) */
		if (mask & 8) {
			emit_precossin(pc, temp, src[0][3]);
			emit_flop(pc, 5, dst[3], temp);
			if (!(mask &= 7)) /* strip w; done if only w written */
				break;
			if (temp == dst[3])
				temp = brdc = temp_temp(pc);
		}
		emit_precossin(pc, temp, src[0][0]);
		emit_flop(pc, 5, brdc, temp);
		break;
	case TGSI_OPCODE_DP3:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
		break;
	case TGSI_OPCODE_DP4:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, temp, src[0][2], src[1][2], temp);
		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
		break;
	case TGSI_OPCODE_DPH:
		/* like DP4 with src0.w forced to 1.0 */
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, temp, src[0][2], src[1][2], temp);
		emit_add(pc, brdc, src[1][3], temp);
		break;
	case TGSI_OPCODE_DST:
		if (mask & (1 << 1))
			emit_mul(pc, dst[1], src[0][1], src[1][1]);
		if (mask & (1 << 2))
			emit_mov(pc, dst[2], src[0][2]);
		if (mask & (1 << 3))
			emit_mov(pc, dst[3], src[1][3]);
		if (mask & (1 << 0))
			emit_mov_immdval(pc, dst[0], 1.0f);
		break;
	case TGSI_OPCODE_EX2:
		emit_preex2(pc, temp, src[0][0]);
		emit_flop(pc, 6, brdc, temp);
		break;
	case TGSI_OPCODE_FLR:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_flr(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_FRC:
		/* frc(x) = x - floor(x) */
		temp = temp_temp(pc);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_flr(pc, temp, src[0][c]);
			emit_sub(pc, dst[c], src[0][c], temp);
		}
		break;
	case TGSI_OPCODE_KIL:
		emit_kil(pc, src[0][0]);
		emit_kil(pc, src[0][1]);
		emit_kil(pc, src[0][2]);
		emit_kil(pc, src[0][3]);
		break;
	case TGSI_OPCODE_LIT:
		emit_lit(pc, &dst[0], mask, &src[0][0]);
		break;
	case TGSI_OPCODE_LG2:
		emit_flop(pc, 3, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_LRP:
		/* lrp(a,x,y) = (x - y) * a + y */
		temp = temp_temp(pc);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_sub(pc, temp, src[1][c], src[2][c]);
			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
		}
		break;
	case TGSI_OPCODE_MAD:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
		}
		break;
	case TGSI_OPCODE_MAX:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_MIN:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_MOV:
	case TGSI_OPCODE_SWZ:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mov(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_MUL:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mul(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_POW:
		emit_pow(pc, brdc, src[0][0], src[1][0]);
		break;
	case TGSI_OPCODE_RCP:
		emit_flop(pc, 0, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_RSQ:
		emit_flop(pc, 2, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_SCS:
		temp = temp_temp(pc);
		if (mask & 3)
			emit_precossin(pc, temp, src[0][0]);
		if (mask & (1 << 0))
			emit_flop(pc, 5, dst[0], temp); /* cos */
		if (mask & (1 << 1))
			emit_flop(pc, 4, dst[1], temp); /* sin */
		if (mask & (1 << 2))
			emit_mov_immdval(pc, dst[2], 0.0);
		if (mask & (1 << 3))
			emit_mov_immdval(pc, dst[3], 1.0);
		break;
	case TGSI_OPCODE_SIN:
		/* same structure as COS above */
		if (mask & 8) {
			emit_precossin(pc, temp, src[0][3]);
			emit_flop(pc, 4, dst[3], temp);
			if (!(mask &= 7))
				break;
			if (temp == dst[3])
				temp = brdc = temp_temp(pc);
		}
		emit_precossin(pc, temp, src[0][0]);
		emit_flop(pc, 4, brdc, temp);
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
		/* all comparisons share emit_set, only the cc differs */
		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_SUB:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_sub(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_TEX:
		emit_tex(pc, dst, mask, src[0], unit,
			 inst->InstructionExtTexture.Texture, FALSE);
		break;
	case TGSI_OPCODE_TXP:
		emit_tex(pc, dst, mask, src[0], unit,
			 inst->InstructionExtTexture.Texture, TRUE);
		break;
	case TGSI_OPCODE_TRUNC:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_cvt(pc, dst[c], src[0][c], -1,
				 CVTOP_TRUNC, CVT_F32_F32);
		}
		break;
	case TGSI_OPCODE_XPD:
		/* cross product via MUL + MSB (mul and subtract) */
		temp = temp_temp(pc);
		if (mask & (1 << 0)) {
			emit_mul(pc, temp, src[0][2], src[1][1]);
			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
		}
		if (mask & (1 << 1)) {
			emit_mul(pc, temp, src[0][0], src[1][2]);
			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
		}
		if (mask & (1 << 2)) {
			emit_mul(pc, temp, src[0][1], src[1][0]);
			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
		}
		if (mask & (1 << 3))
			emit_mov_immdval(pc, dst[3], 1.0);
		break;
	case TGSI_OPCODE_END:
		break;
	default:
		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
		return FALSE;
	}

	/* broadcast/saturate the scalar result, or saturate per component */
	if (brdc) {
		if (sat)
			emit_sat(pc, brdc, brdc);
		for (c = 0; c < 4; c++)
			if ((mask & (1 << c)) && dst[c] != brdc)
				emit_mov(pc, dst[c], brdc);
	} else
	if (sat) {
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			/* in this case we saturate later */
			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
				continue;
			emit_sat(pc, rdst[c], dst[c]);
		}
	}

	/* free immediates tgsi_src allocated on the fly (index == -1) */
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		for (c = 0; c < 4; c++) {
			if (!src[i][c])
				continue;
			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
				FREE(src[i][c]);
		}
	}

	kill_temp_temp(pc);
	return TRUE;
}
1764
1765static void
1766prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
1767{
1768	struct nv50_reg *reg = NULL;
1769	const struct tgsi_full_src_register *src;
1770	const struct tgsi_dst_register *dst;
1771	unsigned i, c, k, mask;
1772
1773	dst = &insn->FullDstRegisters[0].DstRegister;
1774	mask = dst->WriteMask;
1775
1776        if (dst->File == TGSI_FILE_TEMPORARY)
1777                reg = pc->temp;
1778        else
1779        if (dst->File == TGSI_FILE_OUTPUT)
1780                reg = pc->result;
1781
1782	if (reg) {
1783		for (c = 0; c < 4; c++) {
1784			if (!(mask & (1 << c)))
1785				continue;
1786			reg[dst->Index * 4 + c].acc = pc->insn_nr;
1787		}
1788	}
1789
1790	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1791		src = &insn->FullSrcRegisters[i];
1792
1793		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
1794			reg = pc->temp;
1795		else
1796		if (src->SrcRegister.File == TGSI_FILE_INPUT)
1797			reg = pc->attr;
1798		else
1799			continue;
1800
1801		mask = nv50_tgsi_src_mask(insn, i);
1802
1803		for (c = 0; c < 4; c++) {
1804			if (!(mask & (1 << c)))
1805				continue;
1806			k = tgsi_util_get_full_src_register_extswizzle(src, c);
1807
1808			if (k > TGSI_EXTSWIZZLE_W)
1809				continue;
1810
1811			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
1812		}
1813	}
1814}
1815
/* Reorder the destination write sequence so that, where possible, no
 * component is written before another component that still reads it.
 *
 * m[i]    (out) component to write in the i-th position
 * rdep[c] (in)  bitmask of dst components that require dst[c] as source
 *
 * Returns a bitmask of write positions (NOTE: with respect to order,
 * not component) whose dependencies could not be resolved by
 * reordering; those writes must go through a temporary.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* unsafe must start at 0: it is only ever OR'ed into below
	 * (previously read uninitialized -> undefined behavior)
	 */
	unsigned i, c, x, unsafe = 0;

	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
1860
1861/* Select a suitable dst register for broadcasting scalar results,
1862 * or return NULL if we have to allocate an extra TEMP.
1863 *
1864 * If e.g. only 1 component is written, we may also emit the final
1865 * result to a write-only register.
1866 */
1867static struct nv50_reg *
1868tgsi_broadcast_dst(struct nv50_pc *pc,
1869		   const struct tgsi_full_dst_register *fd, unsigned mask)
1870{
1871	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
1872		int c = ffs(~mask & fd->DstRegister.WriteMask);
1873		if (c)
1874			return tgsi_dst(pc, c - 1, fd);
1875	} else {
1876		int c = ffs(fd->DstRegister.WriteMask) - 1;
1877		if ((1 << c) == fd->DstRegister.WriteMask)
1878			return tgsi_dst(pc, c, fd);
1879	}
1880
1881	return NULL;
1882}
1883
/* Scan source swizzles and return a bitmask indicating dst regs that
 * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
 */
static unsigned
nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
		       unsigned rdep[4])
{
	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
	const struct tgsi_full_src_register *fs;
	unsigned i, deqs = 0;

	for (i = 0; i < 4; ++i)
		rdep[i] = 0;

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
		boolean neg_supp = negate_supported(insn, i);

		fs = &insn->FullSrcRegisters[i];
		/* only a src aliasing the dst register can be clobbered */
		if (fs->SrcRegister.File != fd->DstRegister.File ||
		    fs->SrcRegister.Index != fd->DstRegister.Index)
			continue;

		for (chn = 0; chn < 4; ++chn) {
			unsigned s, c;

			if (!(mask & (1 << chn))) /* src is not read */
				continue;
			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);

			/* ZERO/ONE swizzles and unwritten components are safe */
			if (c > TGSI_EXTSWIZZLE_W ||
			    !(fd->DstRegister.WriteMask & (1 << c)))
				continue;

			/* no danger if src is copied to TEMP first */
			if ((s != TGSI_UTIL_SIGN_KEEP) &&
			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
				continue;

			/* dst component c is read; record which dst writes
			 * depend on it, and flag it as clobber-endangered
			 */
			rdep[c] |= nv50_tgsi_dst_revdep(
				insn->Instruction.Opcode, i, chn);
			deqs |= (1 << c);
		}
	}

	return deqs;
}
1932
/* Translate a TGSI instruction token, handling writes that would
 * clobber still-needed sources.
 *
 * Scalar ops get a broadcast dst (pc->r_brdc). Vector ops with unsafe
 * dst/src aliasing are split into per-component emissions in a safe
 * order (nv50_revdep_reorder); components that remain unsafe are
 * redirected to temps (pc->r_dst) and copied/saturated to the real
 * destination afterwards.
 */
static boolean
nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
{
	struct tgsi_full_instruction insn = tok->FullInstruction;
	const struct tgsi_full_dst_register *fd;
	unsigned i, deqs, rdep[4], m[4];

	fd = &tok->FullInstruction.FullDstRegisters[0];
	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);

	if (is_scalar_op(insn.Instruction.Opcode)) {
		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
		if (!pc->r_brdc)
			pc->r_brdc = temp_temp(pc);
		return nv50_program_tx_insn(pc, &insn);
	}
	pc->r_brdc = NULL;

	/* no dst/src aliasing: emit as a single vector instruction */
	if (!deqs)
		return nv50_program_tx_insn(pc, &insn);

	deqs = nv50_revdep_reorder(m, rdep);

	/* emit one component at a time, in the reordered sequence m[] */
	for (i = 0; i < 4; ++i) {
		assert(pc->r_dst[m[i]] == NULL);

		insn.FullDstRegisters[0].DstRegister.WriteMask =
			fd->DstRegister.WriteMask & (1 << m[i]);

		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
			continue;

		/* still unsafe at this position: write to a temp instead */
		if (deqs & (1 << i))
			pc->r_dst[m[i]] = alloc_temp(pc, NULL);

		if (!nv50_program_tx_insn(pc, &insn))
			return FALSE;
	}

	/* move (and saturate) redirected results to the real destination */
	for (i = 0; i < 4; i++) {
		struct nv50_reg *reg = pc->r_dst[i];
		if (!reg)
			continue;
		pc->r_dst[i] = NULL;

		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
		else
			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
		free_temp(pc, reg);
	}

	return TRUE;
}
1987
1988static void
1989load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
1990{
1991	struct nv50_reg *iv, **ppiv;
1992	unsigned mode = pc->interp_mode[reg->index];
1993
1994	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
1995	iv = *ppiv;
1996
1997	if ((mode & INTERP_PERSPECTIVE) && !iv) {
1998		iv = *ppiv = alloc_temp(pc, NULL);
1999		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
2000
2001		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2002		emit_flop(pc, 0, iv, iv);
2003
2004		/* XXX: when loading interpolants dynamically, move these
2005		 * to the program head, or make sure it can't be skipped.
2006		 */
2007	}
2008
2009	emit_interp(pc, reg, iv, mode);
2010}
2011
/* Pre-translation pass: walk the TGSI token stream to gather
 * immediates, declarations and per-register access info, then assign
 * hardware register indices and interpolation setup for the shader's
 * inputs/outputs.
 * Returns FALSE on error (bad declaration or allocation failure).
 */
static boolean
nv50_program_tx_prep(struct nv50_pc *pc)
{
	struct tgsi_parse_context tp;
	struct nv50_program *p = pc->p;
	boolean ret = FALSE;
	unsigned i, c, flat_nr = 0;

	tgsi_parse_init(&tp, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&tp)) {
		const union tgsi_full_token *tok = &tp.FullToken;

		tgsi_parse_token(&tp);
		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm =
				&tp.FullToken.FullImmediate;

			ctor_immd(pc, imm->u[0].Float,
				      imm->u[1].Float,
				      imm->u[2].Float,
				      imm->u[3].Float);
		}
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *d;
			unsigned si, last, first, mode;

			d = &tp.FullToken.FullDeclaration;
			first = d->DeclarationRange.First;
			last = d->DeclarationRange.Last;

			switch (d->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				break;
			case TGSI_FILE_OUTPUT:
				/* only VP output semantics matter here */
				if (!d->Declaration.Semantic ||
				    p->type == PIPE_SHADER_FRAGMENT)
					break;

				si = d->Semantic.SemanticIndex;
				switch (d->Semantic.SemanticName) {
				case TGSI_SEMANTIC_BCOLOR:
					p->cfg.two_side[si].hw = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
				case TGSI_SEMANTIC_PSIZE:
					p->cfg.psiz = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
					/*
				case TGSI_SEMANTIC_CLIP_DISTANCE:
					p->cfg.clpd = MIN2(p->cfg.clpd, first);
					break;
					*/
				default:
					break;
				}
				break;
			case TGSI_FILE_INPUT:
			{
				/* record interpolation mode for FP inputs */
				if (p->type != PIPE_SHADER_FRAGMENT)
					break;

				switch (d->Declaration.Interpolate) {
				case TGSI_INTERPOLATE_CONSTANT:
					mode = INTERP_FLAT;
					flat_nr++;
					break;
				case TGSI_INTERPOLATE_PERSPECTIVE:
					mode = INTERP_PERSPECTIVE;
					p->cfg.regs[1] |= 0x08 << 24;
					break;
				default:
					mode = INTERP_LINEAR;
					break;
				}
				if (d->Declaration.Centroid)
					mode |= INTERP_CENTROID;

				assert(last < 32);
				for (i = first; i <= last; i++)
					pc->interp_mode[i] = mode;
			}
				break;
			case TGSI_FILE_CONSTANT:
				break;
			case TGSI_FILE_SAMPLER:
				break;
			default:
				NOUVEAU_ERR("bad decl file %d\n",
					    d->Declaration.File);
				goto out_err;
			}
		}
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			pc->insn_nr++;
			prep_inspect_insn(pc, &tok->FullInstruction);
			break;
		default:
			break;
		}
	}

	if (p->type == PIPE_SHADER_VERTEX) {
		int rid = 0;

		/* pack accessed VP attributes into consecutive hw slots */
		for (i = 0; i < pc->attr_nr * 4; ++i) {
			if (pc->attr[i].acc) {
				pc->attr[i].hw = rid++;
				p->cfg.attr[i / 32] |= 1 << (i % 32);
			}
		}

		/* pack accessed VP result components likewise */
		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
			p->cfg.io[i].hw = rid;
			p->cfg.io[i].id_vp = i;

			for (c = 0; c < 4; ++c) {
				int n = i * 4 + c;
				if (!pc->result[n].acc)
					continue;
				pc->result[n].hw = rid++;
				p->cfg.io[i].mask |= 1 << c;
			}
		}

		/* resolve deferred slot references (0x40 == unassigned) */
		for (c = 0; c < 2; ++c)
			if (p->cfg.two_side[c].hw < 0x40)
				p->cfg.two_side[c] = p->cfg.io[
					p->cfg.two_side[c].hw];

		if (p->cfg.psiz < 0x40)
			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
	} else
	if (p->type == PIPE_SHADER_FRAGMENT) {
		int rid, aid;
		unsigned n = 0, m = pc->attr_nr - flat_nr;

		/* TGSI input 0 may or may not be the position */
		int base = (TGSI_SEMANTIC_POSITION ==
			    p->info.input_semantic_name[0]) ? 0 : 1;

		/* non-flat interpolants have to be mapped to
		 * the lower hardware IDs, so sort them:
		 */
		for (i = 0; i < pc->attr_nr; i++) {
			if (pc->interp_mode[i] == INTERP_FLAT) {
				p->cfg.io[m].id_vp = i + base;
				p->cfg.io[m++].id_fp = i;
			} else {
				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
					p->cfg.io[n].linear = TRUE;
				p->cfg.io[n].id_vp = i + base;
				p->cfg.io[n++].id_fp = i;
			}
		}

		if (!base) /* set w-coordinate mask from perspective interp */
			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;

		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);

		/* assign hw interpolant slots and emit the loads */
		for (n = 0; n < pc->attr_nr; ++n) {
			p->cfg.io[n].hw = rid = aid;
			i = p->cfg.io[n].id_fp;

			for (c = 0; c < 4; ++c) {
				if (!pc->attr[i * 4 + c].acc)
					continue;
				pc->attr[i * 4 + c].rhw = rid++;
				p->cfg.io[n].mask |= 1 << c;

				load_interpolant(pc, &pc->attr[i * 4 + c]);
			}
			aid += popcnt4(p->cfg.io[n].mask);
		}

		if (!base)
			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;

		m = popcnt4(p->cfg.regs[1] >> 24);

		/* set count of non-position inputs and of non-flat
		 * non-position inputs for FP_INTERPOLANT_CTRL
		 */
		p->cfg.regs[1] |= aid - m;

		if (flat_nr) {
			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
			p->cfg.regs[1] |= (i - m) << 16;
		} else
			/* NOTE(review): ORs the register's own low half
			 * into the flat-count field — looks odd compared
			 * to the branch above; verify intent
			 */
			p->cfg.regs[1] |= p->cfg.regs[1] << 16;

		/* mark color semantic for light-twoside */
		n = 0x40;
		for (i = 0; i < pc->attr_nr; i++) {
			ubyte si, sn;

			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];

			if (sn == TGSI_SEMANTIC_COLOR) {
				p->cfg.two_side[si] = p->cfg.io[i];

				/* increase colour count */
				p->cfg.regs[0] += popcnt4(
					p->cfg.two_side[si].mask) << 16;

				n = MIN2(n, p->cfg.io[i].hw - m);
			}
		}
		if (n < 0x40)
			p->cfg.regs[0] += n;

		/* Initialize FP results:
		 * FragDepth is always first TGSI and last hw output
		 */
		i = p->info.writes_z ? 4 : 0;
		for (rid = 0; i < pc->result_nr * 4; i++)
			pc->result[i].rhw = rid++;
		if (p->info.writes_z)
			pc->result[2].rhw = rid;
	}

	if (pc->immd_nr) {
		int rid = 0;

		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->immd)
			goto out_err;

		for (i = 0; i < pc->immd_nr; i++) {
			for (c = 0; c < 4; c++, rid++)
				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
		}
	}

	ret = TRUE;
out_err:
	/* interpolation divisors were only needed during the loads above */
	if (pc->iv_p)
		free_temp(pc, pc->iv_p);
	if (pc->iv_c)
		free_temp(pc, pc->iv_c);

	tgsi_parse_free(&tp);
	return ret;
}
2265
2266static void
2267free_nv50_pc(struct nv50_pc *pc)
2268{
2269	if (pc->immd)
2270		FREE(pc->immd);
2271	if (pc->param)
2272		FREE(pc->param);
2273	if (pc->result)
2274		FREE(pc->result);
2275	if (pc->attr)
2276		FREE(pc->attr);
2277	if (pc->temp)
2278		FREE(pc->temp);
2279
2280	FREE(pc);
2281}
2282
/* Initialize the translation context from the TGSI scan info: per-file
 * register counts and arrays, plus hardware config defaults depending
 * on the shader type.
 * Returns FALSE on allocation failure; caller cleans up with
 * free_nv50_pc().
 */
static boolean
ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
{
	int i, c;
	/* register types used for attr / result arrays (VP default) */
	unsigned rtype[2] = { P_ATTR, P_RESULT };

	pc->p = p;
	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;

	p->cfg.high_temp = 4;

	/* 0x40 serves as the "unassigned" sentinel for hw slots */
	p->cfg.two_side[0].hw = 0x40;
	p->cfg.two_side[1].hw = 0x40;

	switch (p->type) {
	case PIPE_SHADER_VERTEX:
		p->cfg.psiz = 0x40;
		p->cfg.clpd = 0x40;
		p->cfg.io_nr = pc->result_nr;
		break;
	case PIPE_SHADER_FRAGMENT:
		/* FP inputs are interpolated into temps, results are
		 * moved to their final hw regs at the end of translation
		 */
		rtype[0] = rtype[1] = P_TEMP;

		p->cfg.regs[0] = 0x01000004;
		p->cfg.io_nr = pc->attr_nr;

		if (p->info.writes_z) {
			p->cfg.regs[2] |= 0x00000100;
			p->cfg.regs[3] |= 0x00000011;
		}
		if (p->info.uses_kill)
			p->cfg.regs[2] |= 0x00100000;
		break;
	}

	if (pc->temp_nr) {
		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->temp)
			return FALSE;

		for (i = 0; i < pc->temp_nr * 4; ++i)
			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
	}

	if (pc->attr_nr) {
		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->attr)
			return FALSE;

		for (i = 0; i < pc->attr_nr * 4; ++i)
			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
	}

	if (pc->result_nr) {
		unsigned nr = pc->result_nr * 4;

		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
		if (!pc->result)
			return FALSE;

		for (i = 0; i < nr; ++i)
			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
	}

	if (pc->param_nr) {
		int rid = 0;

		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->param)
			return FALSE;

		/* constants get their final hw index right away */
		for (i = 0; i < pc->param_nr; ++i)
			for (c = 0; c < 4; ++c, ++rid)
				ctor_reg(&pc->param[rid], P_CONST, i, rid);
	}

	return TRUE;
}
2364
/* Translate a whole nv50_program from TGSI to hardware code:
 * prep pass, per-instruction translation, FP result relocation, and a
 * fixup pass pairing/longifying half-size instructions.
 * Returns FALSE on failure.
 */
static boolean
nv50_program_tx(struct nv50_program *p)
{
	struct tgsi_parse_context parse;
	struct nv50_pc *pc;
	unsigned k;
	boolean ret;

	pc = CALLOC_STRUCT(nv50_pc);
	if (!pc)
		return FALSE;

	ret = ctor_nv50_pc(pc, p);
	if (ret == FALSE)
		goto out_cleanup;

	ret = nv50_program_tx_prep(pc);
	if (ret == FALSE)
		goto out_cleanup;

	tgsi_parse_init(&parse, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&parse)) {
		const union tgsi_full_token *tok = &parse.FullToken;

		/* don't allow half insn/immd on first and last instruction */
		pc->allow32 = TRUE;
		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
			pc->allow32 = FALSE;

		tgsi_parse_token(&parse);

		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			++pc->insn_cur;
			ret = nv50_tgsi_insn(pc, tok);
			if (ret == FALSE)
				goto out_err;
			break;
		default:
			break;
		}
	}

	if (p->type == PIPE_SHADER_FRAGMENT) {
		/* move FP results from their temps to the final hw regs */
		struct nv50_reg out;
		ctor_reg(&out, P_TEMP, -1, -1);

		for (k = 0; k < pc->result_nr * 4; k++) {
			if (pc->result[k].rhw == -1)
				continue;
			if (pc->result[k].hw != pc->result[k].rhw) {
				out.hw = pc->result[k].rhw;
				emit_mov(pc, &out, &pc->result[k]);
			}
			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
				pc->p->cfg.high_result = pc->result[k].rhw + 1;
		}
	}

	/* look for single half instructions and make them long */
	struct nv50_program_exec *e, *e_prev;

	for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
		if (!is_long(e))
			k++;

		/* at the end of each run of short insns: an odd count
		 * means one can't be paired, so lengthen the last one
		 */
		if (!e->next || is_long(e->next)) {
			if (k & 1)
				convert_to_long(pc, e);
			k = 0;
		}

		/* e_prev trails by one: ends up as the penultimate exec */
		if (e->next)
			e_prev = e;
	}

	if (!is_long(pc->p->exec_tail)) {
		/* this may occur if moving FP results */
		assert(e_prev && !is_long(e_prev));
		convert_to_long(pc, e_prev);
		convert_to_long(pc, pc->p->exec_tail);
	}

	/* the last instruction must be long and carry the exit bit */
	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
	pc->p->exec_tail->inst[1] |= 0x00000001;

	/* hand the immediate buffer over to the program */
	p->param_nr = pc->param_nr * 4;
	p->immd_nr = pc->immd_nr * 4;
	p->immd = pc->immd_buf;

out_err:
	tgsi_parse_free(&parse);

out_cleanup:
	free_nv50_pc(pc);
	return ret;
}
2462
2463static void
2464nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2465{
2466	if (nv50_program_tx(p) == FALSE)
2467		assert(0);
2468	p->translated = TRUE;
2469}
2470
2471static void
2472nv50_program_upload_data(struct nv50_context *nv50, float *map,
2473			unsigned start, unsigned count, unsigned cbuf)
2474{
2475	struct nouveau_channel *chan = nv50->screen->base.channel;
2476	struct nouveau_grobj *tesla = nv50->screen->tesla;
2477
2478	while (count) {
2479		unsigned nr = count > 2047 ? 2047 : count;
2480
2481		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2482		OUT_RING  (chan, (cbuf << 0) | (start << 8));
2483		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2484		OUT_RINGp (chan, map, nr);
2485
2486		map += nr;
2487		start += nr;
2488		count -= nr;
2489	}
2490}
2491
/* Make the program's constant data resident in the hardware constant
 * buffers.
 *
 * Immediates: uploaded into NV50_CB_PMISC only when the program has no
 * heap allocation yet (p->data[0] == NULL).  If the first allocation
 * attempt fails, other programs' immediate allocations are evicted from
 * the heap until it succeeds.
 *
 * Parameters: copied from the bound gallium constant buffer into
 * NV50_CB_PVP / NV50_CB_PFP on every call.
 */
static void
nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
{
	struct pipe_screen *pscreen = nv50->pipe.screen;

	if (!p->data[0] && p->immd_nr) {
		struct nouveau_resource *heap = nv50->screen->immd_heap[0];

		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
			/* Evict other programs until enough room exists.
			 * NOTE(review): assumes freeing heap->next entries
			 * grows heap->size / advances heap->next — verify
			 * against nouveau_resource_free(), otherwise this
			 * could spin. */
			while (heap->next && heap->size < p->immd_nr) {
				struct nv50_program *evict = heap->next->priv;
				nouveau_resource_free(&evict->data[0]);
			}

			/* retry after eviction; still failing is fatal */
			if (nouveau_resource_alloc(heap, p->immd_nr, p,
						   &p->data[0]))
				assert(0);
		}

		/* immediates only need to be uploaded again when freed */
		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
					 p->immd_nr, NV50_CB_PMISC);
	}

	/* param space in the PVP/PFP constant buffers is limited */
	assert(p->param_nr <= 128);

	if (p->param_nr) {
		unsigned cb;
		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
					     PIPE_BUFFER_USAGE_CPU_READ);

		if (p->type == PIPE_SHADER_VERTEX)
			cb = NV50_CB_PVP;
		else
			cb = NV50_CB_PFP;

		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
	}
}
2532
/* Upload the program's machine code to its VRAM buffer object.
 *
 * Before uploading, constant-buffer references embedded in the
 * instruction words are patched with the current start offset of the
 * program's immediate-heap allocation.  The upload goes through the
 * NV50_CB_PUPLOAD constant buffer, which is bound to p->bo.
 */
static void
nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
{
	struct nouveau_channel *chan = nv50->screen->base.channel;
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program_exec *e;
	struct nouveau_stateobj *so;
	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
	unsigned start, count, *up, *ptr;
	boolean upload = FALSE;

	/* first use: allocate the code BO (4 bytes per instruction word) */
	if (!p->bo) {
		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
			       p->exec_size * 4, &p->bo);
		upload = TRUE;
	}

	/* re-upload if the immediate allocation moved since last time */
	if (p->data[0] && p->data[0]->start != p->data_start[0])
		upload = TRUE;

	if (!upload)
		return;

	/* patch constant-buffer indices into the instruction stream */
	for (e = p->exec_head; e; e = e->next) {
		unsigned ei, ci, bs;

		if (e->param.index < 0)
			continue;
		/* NOTE(review): bits 22-24 of inst[1] appear to select the
		 * constant buffer; only buffer 0 (immediates) gets the heap
		 * start offset added — confirm against the ISA docs. */
		bs = (e->inst[1] >> 22) & 0x07;
		assert(bs < 2);
		ei = e->param.shift >> 5;
		ci = e->param.index;
		if (bs == 0)
			ci += p->data[bs]->start;

		e->inst[ei] &= ~e->param.mask;
		e->inst[ei] |= (ci << e->param.shift);
	}

	/* remember where the immediates were when this code was patched */
	if (p->data[0])
		p->data_start[0] = p->data[0]->start;

#ifdef NV50_PROGRAM_DUMP
	NOUVEAU_ERR("-------\n");
	for (e = p->exec_head; e; e = e->next) {
		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
		if (is_long(e))
			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
	}
#endif

	/* flatten the exec list into a contiguous word array for upload */
	up = ptr = MALLOC(p->exec_size * 4);
	for (e = p->exec_head; e; e = e->next) {
		*(ptr++) = e->inst[0];
		if (is_long(e))
			*(ptr++) = e->inst[1];
	}

	/* bind p->bo as the PUPLOAD constant buffer */
	so = so_new(4,2);
	so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));

	start = 0; count = p->exec_size;
	while (count) {
		struct nouveau_channel *chan = nv50->screen->base.channel;
		unsigned nr;

		/* re-emit the CB binding each iteration: a FIRE_RING below
		 * invalidates previously emitted state */
		so_emit(chan, so);

		nr = MIN2(count, 2047);
		nr = MIN2(chan->pushbuf->remaining, nr);
		/* need nr data words plus 3 words of methods; flush and
		 * retry if the pushbuf can't hold them */
		if (chan->pushbuf->remaining < (nr + 3)) {
			FIRE_RING(chan);
			continue;
		}

		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
		OUT_RINGp (chan, up + start, nr);

		start += nr;
		count -= nr;
	}

	FREE(up);
	so_ref(NULL, &so);
}
2623
/* Validate the bound vertex program: translate it if necessary, upload
 * its constants and code, and record the VP hardware state methods in
 * nv50->state.vertprog.
 */
void
nv50_vertprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->vertprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(13, 2);
	/* code address = VRAM offset of the program's buffer object */
	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_LOW, 0, 0);
	/* enabled input attribute bitmasks computed during translation */
	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
	so_data  (so, p->cfg.attr[0]);
	so_data  (so, p->cfg.attr[1]);
	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
	so_data  (so, p->cfg.high_result);
	/* result map size and temp register allocation */
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
	so_data  (so, p->cfg.high_result); //8);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.vertprog);
	so_ref(NULL, &so);
}
2659
/* Validate the bound fragment program: translate it if necessary, upload
 * its constants and code, and record the FP hardware state methods in
 * nv50->state.fragprog.
 */
void
nv50_fragprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->fragprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(64, 2);
	/* code address = VRAM offset of the program's buffer object */
	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		      NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
	so_data  (so, p->cfg.high_result);
	/* control words filled in during translation; exact bit meanings
	 * unknown (UNK methods) */
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
	so_data  (so, p->cfg.regs[2]);
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
	so_data  (so, p->cfg.regs[3]);
	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
	so_data  (so, 0); /* program start offset */
	so_ref(so, &nv50->state.fragprog);
	so_ref(NULL, &so);
}
2695
/* Build the POINT_COORD_REPLACE_MAP words (pntc[8]): for each generic FP
 * input that should be replaced by point-sprite coordinates, store the
 * coordinate component index (c + 1) in the 4-bit field of its
 * interpolant slot, starting at slot `base`.
 */
static void
nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
{
	struct nv50_program *fp = nv50->fragprog;
	struct nv50_program *vp = nv50->vertprog;
	unsigned i, c, m = base;

	/* XXX: This can't work correctly in all cases yet, we either
	 * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has
	 * to be per FP input instead of per VP output
	 */
	memset(pntc, 0, 8 * sizeof(uint32_t));

	for (i = 0; i < fp->cfg.io_nr; i++) {
		uint8_t sn, si;
		uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
		unsigned n = popcnt4(fp->cfg.io[i].mask);

		/* non-generic inputs are never replaced; skip their slots */
		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
			m += n;
			continue;
		}

		sn = vp->info.input_semantic_name[j];
		si = vp->info.input_semantic_index[j];

		/* NOTE(review): j indexes VP outputs but is bounded by
		 * fp->cfg.io_nr and looked up in vp->info.input_semantic_*
		 * — both look questionable; confirm against how id_vp is
		 * assigned during translation. */
		if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) {
			ubyte mode =
				nv50->rasterizer->pipe.sprite_coord_mode[si];

			if (mode == PIPE_SPRITE_COORD_NONE) {
				m += n;
				continue;
			}
		}

		/* this is either PointCoord or replaced by sprite coords */
		for (c = 0; c < 4; c++) {
			if (!(fp->cfg.io[i].mask & (1 << c)))
				continue;
			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
			++m;
		}
	}
}
2741
2742static int
2743nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
2744	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
2745{
2746	int c;
2747	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
2748	uint8_t *map = (uint8_t *)p_map;
2749
2750	for (c = 0; c < 4; ++c) {
2751		if (mf & 1) {
2752			if (fpi->linear == TRUE)
2753				lin[mid / 32] |= 1 << (mid % 32);
2754			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
2755		}
2756
2757		oid += mv & 1;
2758		mf >>= 1;
2759		mv >>= 1;
2760	}
2761
2762	return mid;
2763}
2764
/* Link the vertex and fragment programs: build the VP result map that
 * routes VP outputs to FP inputs (HPOS first, then clip distances,
 * two-sided colours, remaining inputs by matching semantics, and point
 * size), plus the semantic-id registers, interpolation control and
 * point-sprite replacement.  The result is recorded in
 * nv50->state.programs.
 */
void
nv50_linkage_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *vp = nv50->vertprog;
	struct nv50_program *fp = nv50->fragprog;
	struct nouveau_stateobj *so;
	struct nv50_sreg4 dummy, *vpo;
	int i, n, c, m = 0;
	uint32_t map[16], lin[4], reg[5], pcrd[8];

	memset(map, 0, sizeof(map));
	memset(lin, 0, sizeof(lin));

	reg[1] = 0x00000004; /* low and high clip distance map ids */
	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
	reg[3] = 0x00000000; /* point size map id & enable */
	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
	reg[4] = fp->cfg.regs[1]; /* interpolant info */

	dummy.linear = FALSE;
	dummy.mask = 0xf; /* map all components of HPOS */
	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);

	/* from here on the dummy stands in for an unwritten VP output */
	dummy.mask = 0x0;

	if (vp->cfg.clpd < 0x40) {
		for (c = 0; c < vp->cfg.clpd_nr; ++c)
			map[m++] = vp->cfg.clpd + c;
		reg[1] = (m << 8);
	}

	reg[0] |= m << 8; /* adjust BFC0 id */

	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
	if (nv50->rasterizer->pipe.light_twoside) {
		vpo = &vp->cfg.two_side[0];

		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
	}

	reg[0] += m - 4; /* adjust FFC0 id */
	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */

	/* skip FP input 0 if it is the position (already mapped above) */
	i = 0;
	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
		i = 1;
	for (; i < fp->cfg.io_nr; i++) {
		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];

		/* use the matching VP output, or the dummy (constant 0/1
		 * components) when no VP output has the same semantic */
		n = fp->cfg.io[i].id_vp;
		if (n >= vp->cfg.io_nr ||
		    vp->info.output_semantic_name[n] != sn ||
		    vp->info.output_semantic_index[n] != si)
			vpo = &dummy;
		else
			vpo = &vp->cfg.io[n];

		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
	}

	if (nv50->rasterizer->pipe.point_size_per_vertex) {
		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
		reg[3] = (m++ << 4) | 1;
	}

	/* now fill the stateobj */
	so = so_new(64, 0);

	n = (m + 3) / 4;
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
	so_data  (so, m);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
	so_datap (so, map, n);

	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
	so_datap (so, reg, 4);

	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
	so_data  (so, reg[4]);

	/* 0x1540: linear interpolation flags per FP input component */
	so_method(so, tesla, 0x1540, 4);
	so_datap (so, lin, 4);

	if (nv50->rasterizer->pipe.point_sprite) {
		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);

		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
		so_datap (so, pcrd, 8);
	}

        so_ref(so, &nv50->state.programs);
        so_ref(NULL, &so);
}
2861
2862void
2863nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2864{
2865	while (p->exec_head) {
2866		struct nv50_program_exec *e = p->exec_head;
2867
2868		p->exec_head = e->next;
2869		FREE(e);
2870	}
2871	p->exec_tail = NULL;
2872	p->exec_size = 0;
2873
2874	nouveau_bo_ref(NULL, &p->bo);
2875
2876	nouveau_resource_free(&p->data[0]);
2877
2878	p->translated = 0;
2879}
2880