nv50_program.c revision 1196f9fbd68d9f3d1acd3d097711b382d7489f41
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
34#define NV50_SU_MAX_TEMP 64
35//#define NV50_PROGRAM_DUMP
36
37/* ARL - gallium chokes on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * 	- Drop it; instead introduce a way to negate args for ops that
41 * 	  support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled at the moment, as that fixes POW.
52 *
53 * Watch out for dst==src vectors - we can overwrite components that are still needed,
54 * 	e.g. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * 	FP attr/result assignment - how?
58 * 		attrib
59 * 			- 0x16bc maps vp output onto fp hpos
60 * 			- 0x16c0 maps vp output onto fp col0
61 * 		result
62 * 			- colr always 0-3
63 * 			- depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * 	      "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * 	- 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * 	- XX == FP high something
75 */
76struct nv50_reg {
77	enum {
78		P_TEMP,
79		P_ATTR,
80		P_RESULT,
81		P_CONST,
82		P_IMMD
83	} type;
84	int index;
85
86	int hw;
87	int neg;
88
89	int rhw; /* result hw for FP outputs, or interpolant index */
90	int acc; /* instruction where this reg is last read (first insn == 1) */
91};
92
93/* arbitrary limit */
94#define MAX_IF_DEPTH 4
95
96struct nv50_pc {
97	struct nv50_program *p;
98
99	/* hw resources */
100	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
101
102	/* tgsi resources */
103	struct nv50_reg *temp;
104	int temp_nr;
105	struct nv50_reg *attr;
106	int attr_nr;
107	struct nv50_reg *result;
108	int result_nr;
109	struct nv50_reg *param;
110	int param_nr;
111	struct nv50_reg *immd;
112	float *immd_buf;
113	int immd_nr;
114
115	struct nv50_reg *temp_temp[16];
116	unsigned temp_temp_nr;
117
118	/* broadcast and destination replacement regs */
119	struct nv50_reg *r_brdc;
120	struct nv50_reg *r_dst[4];
121
122	unsigned interp_mode[32];
123	/* perspective interpolation registers */
124	struct nv50_reg *iv_p;
125	struct nv50_reg *iv_c;
126
127	struct nv50_program_exec *if_cond;
128	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
129	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
130	int if_lvl;
131
132	/* current instruction and total number of insns */
133	unsigned insn_cur;
134	unsigned insn_nr;
135
136	boolean allow32;
137};
138
139static INLINE void
140ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
141{
142	reg->type = type;
143	reg->index = index;
144	reg->hw = hw;
145	reg->neg = 0;
146	reg->rhw = -1;
147	reg->acc = 0;
148}
149
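/* Count the bits set in the low nibble of val, e.g. popcnt4(0xb) == 3.
 * Used to count enabled components in the cfg.io[] masks and in the
 * FP interpolant control word (cfg.regs[1]).
 */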
150static INLINE unsigned
151popcnt4(uint32_t val)
152{
153	static const unsigned cnt[16]
154	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
155	return cnt[val & 0xf];
156}
157
158static void
159alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
160{
161	int i = 0;
162
163	if (reg->type == P_RESULT) {
164		if (pc->p->cfg.high_result < (reg->hw + 1))
165			pc->p->cfg.high_result = reg->hw + 1;
166	}
167
168	if (reg->type != P_TEMP)
169		return;
170
171	if (reg->hw >= 0) {
172		/*XXX: do this here too to catch FP temp-as-attr usage..
173		 *     not clean, but works */
174		if (pc->p->cfg.high_temp < (reg->hw + 1))
175			pc->p->cfg.high_temp = reg->hw + 1;
176		return;
177	}
178
179	if (reg->rhw != -1) {
180		/* try to allocate temporary with index rhw first */
181		if (!(pc->r_temp[reg->rhw])) {
182			pc->r_temp[reg->rhw] = reg;
183			reg->hw = reg->rhw;
184			if (pc->p->cfg.high_temp < (reg->rhw + 1))
185				pc->p->cfg.high_temp = reg->rhw + 1;
186			return;
187		}
188		/* make sure we don't get things like $r0 needs to go
189		 * in $r1 and $r1 in $r0
190		 */
191		i = pc->result_nr * 4;
192	}
193
194	for (; i < NV50_SU_MAX_TEMP; i++) {
195		if (!(pc->r_temp[i])) {
196			pc->r_temp[i] = reg;
197			reg->hw = i;
198			if (pc->p->cfg.high_temp < (i + 1))
199				pc->p->cfg.high_temp = i + 1;
200			return;
201		}
202	}
203
204	assert(0);
205}
206
207static struct nv50_reg *
208alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
209{
210	struct nv50_reg *r;
211	int i;
212
213	if (dst && dst->type == P_TEMP && dst->hw == -1)
214		return dst;
215
216	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
217		if (!pc->r_temp[i]) {
218			r = MALLOC_STRUCT(nv50_reg);
219			ctor_reg(r, P_TEMP, -1, i);
220			pc->r_temp[i] = r;
221			return r;
222		}
223	}
224
225	assert(0);
226	return NULL;
227}
228
229/* Assign the hw of the discarded temporary register src
230 * to the tgsi register dst and free src.
231 */
232static void
233assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
234{
235	assert(src->index == -1 && src->hw != -1);
236
237	if (dst->hw != -1)
238		pc->r_temp[dst->hw] = NULL;
239	pc->r_temp[src->hw] = dst;
240	dst->hw = src->hw;
241
242	FREE(src);
243}
244
245/* release the hardware resource held by r */
246static void
247release_hw(struct nv50_pc *pc, struct nv50_reg *r)
248{
249	assert(r->type == P_TEMP);
250	if (r->hw == -1)
251		return;
252
253	assert(pc->r_temp[r->hw] == r);
254	pc->r_temp[r->hw] = NULL;
255
256	r->acc = 0;
257	if (r->index == -1)
258		FREE(r);
259}
260
261static void
262free_temp(struct nv50_pc *pc, struct nv50_reg *r)
263{
264	if (r->index == -1) {
265		unsigned hw = r->hw;
266
267		FREE(pc->r_temp[hw]);
268		pc->r_temp[hw] = NULL;
269	}
270}
271
272static int
273alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
274{
275	int i;
276
277	if ((idx + 4) > NV50_SU_MAX_TEMP)
278		return 1;
279
280	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
281	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
282		return alloc_temp4(pc, dst, idx + 4);
283
284	for (i = 0; i < 4; i++) {
285		dst[i] = MALLOC_STRUCT(nv50_reg);
286		ctor_reg(dst[i], P_TEMP, -1, idx + i);
287		pc->r_temp[idx + i] = dst[i];
288	}
289
290	return 0;
291}
292
293static void
294free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
295{
296	int i;
297
298	for (i = 0; i < 4; i++)
299		free_temp(pc, reg[i]);
300}
301
302static struct nv50_reg *
303temp_temp(struct nv50_pc *pc)
304{
305	if (pc->temp_temp_nr >= 16)
306		assert(0);
307
308	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
309	return pc->temp_temp[pc->temp_temp_nr++];
310}
311
312static void
313kill_temp_temp(struct nv50_pc *pc)
314{
315	int i;
316
317	for (i = 0; i < pc->temp_temp_nr; i++)
318		free_temp(pc, pc->temp_temp[i]);
319	pc->temp_temp_nr = 0;
320}
321
322static int
323ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
324{
325	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
326			       (pc->immd_nr + 1) * 4 * sizeof(float));
327	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
328	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
329	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
330	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
331
332	return pc->immd_nr++;
333}
334
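/* Return an immediate register holding the value f. If f is not yet in
 * immd_buf, a whole vec4 (f, -f, 0.5 * f, 0) is appended, presumably so
 * the negated and halved values can be found by later lookups.
 */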
335static struct nv50_reg *
336alloc_immd(struct nv50_pc *pc, float f)
337{
338	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
339	unsigned hw;
340
341	for (hw = 0; hw < pc->immd_nr * 4; hw++)
342		if (pc->immd_buf[hw] == f)
343			break;
344
345	if (hw == pc->immd_nr * 4)
346		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
347
348	ctor_reg(r, P_IMMD, -1, hw);
349	return r;
350}
351
352static struct nv50_program_exec *
353exec(struct nv50_pc *pc)
354{
355	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
356
357	e->param.index = -1;
358	return e;
359}
360
361static void
362emit(struct nv50_pc *pc, struct nv50_program_exec *e)
363{
364	struct nv50_program *p = pc->p;
365
366	if (p->exec_tail)
367		p->exec_tail->next = e;
368	if (!p->exec_head)
369		p->exec_head = e;
370	p->exec_tail = e;
371	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
372}
373
374static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
375
376static boolean
377is_long(struct nv50_program_exec *e)
378{
379	if (e->inst[0] & 1)
380		return TRUE;
381	return FALSE;
382}
383
384static boolean
385is_immd(struct nv50_program_exec *e)
386{
387	if (is_long(e) && (e->inst[1] & 3) == 3)
388		return TRUE;
389	return FALSE;
390}
391
392static INLINE void
393set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
394	 struct nv50_program_exec *e)
395{
396	set_long(pc, e);
397	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
398	e->inst[1] |= (pred << 7) | (idx << 12);
399}
400
401static INLINE void
402set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
403	    struct nv50_program_exec *e)
404{
405	set_long(pc, e);
406	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
407	e->inst[1] |= (idx << 4) | (on << 6);
408}
409
410static INLINE void
411set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
412{
413	if (is_long(e))
414		return;
415
416	e->inst[0] |= 1;
417	set_pred(pc, 0xf, 0, e);
418	set_pred_wr(pc, 0, 0, e);
419}
420
421static INLINE void
422set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
423{
424	if (dst->type == P_RESULT) {
425		set_long(pc, e);
426		e->inst[1] |= 0x00000008;
427	}
428
429	alloc_reg(pc, dst);
430	e->inst[0] |= (dst->hw << 2);
431}
432
433static INLINE void
434set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
435{
436	float f = pc->immd_buf[imm->hw];
437	unsigned val = fui(imm->neg ? -f : f);
438
439	set_long(pc, e);
440	/*XXX: can't be predicated - bits overlap.. catch cases where both
441	 *     are required and avoid them. */
442	set_pred(pc, 0, 0, e);
443	set_pred_wr(pc, 0, 0, e);
444
445	e->inst[1] |= 0x00000002 | 0x00000001;
446	e->inst[0] |= (val & 0x3f) << 16;
447	e->inst[1] |= (val >> 6) << 2;
448}
449
450
451#define INTERP_LINEAR		0
452#define INTERP_FLAT			1
453#define INTERP_PERSPECTIVE	2
454#define INTERP_CENTROID		4
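/* Modes may be combined (e.g. INTERP_PERSPECTIVE | INTERP_CENTROID);
 * INTERP_FLAT takes precedence over the other bits in emit_interp.
 */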
455
456/* interpolant index has been stored in dst->rhw */
457static void
458emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
459		unsigned mode)
460{
461	struct nv50_program_exec *e = exec(pc);
462	assert(dst->rhw != -1);
463
464	e->inst[0] |= 0x80000000;
465	set_dst(pc, dst, e);
466	e->inst[0] |= (dst->rhw << 16);
467
468	if (mode & INTERP_FLAT) {
469		e->inst[0] |= (1 << 8);
470	} else {
471		if (mode & INTERP_PERSPECTIVE) {
472			e->inst[0] |= (1 << 25);
473			alloc_reg(pc, iv);
474			e->inst[0] |= (iv->hw << 9);
475		}
476
477		if (mode & INTERP_CENTROID)
478			e->inst[0] |= (1 << 24);
479	}
480
481	emit(pc, e);
482}
483
484static void
485set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
486	 struct nv50_program_exec *e)
487{
488	set_long(pc, e);
489
490	e->param.index = src->hw;
491	e->param.shift = s;
492	e->param.mask = m << (s % 32);
493
494	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
495}
496
497static void
498emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
499{
500	struct nv50_program_exec *e = exec(pc);
501
502	e->inst[0] |= 0x10000000;
503
504	set_dst(pc, dst, e);
505
506	if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
507		set_immd(pc, src, e);
508		/*XXX: 32-bit, but steals part of "half" reg space - need to
509		 *     catch and handle this case if/when we do half-regs
510		 */
511	} else
512	if (src->type == P_IMMD || src->type == P_CONST) {
513		set_long(pc, e);
514		set_data(pc, src, 0x7f, 9, e);
515		e->inst[1] |= 0x20000000; /* src0 const? */
516	} else {
517		if (src->type == P_ATTR) {
518			set_long(pc, e);
519			e->inst[1] |= 0x00200000;
520		}
521
522		alloc_reg(pc, src);
523		e->inst[0] |= (src->hw << 9);
524	}
525
526	if (is_long(e) && !is_immd(e)) {
527		e->inst[1] |= 0x04000000; /* 32-bit */
528		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
529		if (!(e->inst[1] & 0x20000000))
530			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
531	} else
532		e->inst[0] |= 0x00008000;
533
534	emit(pc, e);
535}
536
537static INLINE void
538emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
539{
540	struct nv50_reg *imm = alloc_immd(pc, f);
541	emit_mov(pc, dst, imm);
542	FREE(imm);
543}
544
545static boolean
546check_swap_src_0_1(struct nv50_pc *pc,
547		   struct nv50_reg **s0, struct nv50_reg **s1)
548{
549	struct nv50_reg *src0 = *s0, *src1 = *s1;
550
551	if (src0->type == P_CONST) {
552		if (src1->type != P_CONST) {
553			*s0 = src1;
554			*s1 = src0;
555			return TRUE;
556		}
557	} else
558	if (src1->type == P_ATTR) {
559		if (src0->type != P_ATTR) {
560			*s0 = src1;
561			*s1 = src0;
562			return TRUE;
563		}
564	}
565
566	return FALSE;
567}
568
569static void
570set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
571{
572	if (src->type == P_ATTR) {
573		set_long(pc, e);
574		e->inst[1] |= 0x00200000;
575	} else
576	if (src->type == P_CONST || src->type == P_IMMD) {
577		struct nv50_reg *temp = temp_temp(pc);
578
579		emit_mov(pc, temp, src);
580		src = temp;
581	}
582
583	alloc_reg(pc, src);
584	e->inst[0] |= (src->hw << 9);
585}
586
587static void
588set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
589{
590	if (src->type == P_ATTR) {
591		struct nv50_reg *temp = temp_temp(pc);
592
593		emit_mov(pc, temp, src);
594		src = temp;
595	} else
596	if (src->type == P_CONST || src->type == P_IMMD) {
597		assert(!(e->inst[0] & 0x00800000));
598		if (e->inst[0] & 0x01000000) {
599			struct nv50_reg *temp = temp_temp(pc);
600
601			emit_mov(pc, temp, src);
602			src = temp;
603		} else {
604			set_data(pc, src, 0x7f, 16, e);
605			e->inst[0] |= 0x00800000;
606		}
607	}
608
609	alloc_reg(pc, src);
610	e->inst[0] |= (src->hw << 16);
611}
612
613static void
614set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
615{
616	set_long(pc, e);
617
618	if (src->type == P_ATTR) {
619		struct nv50_reg *temp = temp_temp(pc);
620
621		emit_mov(pc, temp, src);
622		src = temp;
623	} else
624	if (src->type == P_CONST || src->type == P_IMMD) {
625		assert(!(e->inst[0] & 0x01000000));
626		if (e->inst[0] & 0x00800000) {
627			struct nv50_reg *temp = temp_temp(pc);
628
629			emit_mov(pc, temp, src);
630			src = temp;
631		} else {
632			set_data(pc, src, 0x7f, 32+14, e);
633			e->inst[0] |= 0x01000000;
634		}
635	}
636
637	alloc_reg(pc, src);
638	e->inst[1] |= (src->hw << 14);
639}
640
641static void
642emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
643	 struct nv50_reg *src1)
644{
645	struct nv50_program_exec *e = exec(pc);
646
647	e->inst[0] |= 0xc0000000;
648
649	if (!pc->allow32)
650		set_long(pc, e);
651
652	check_swap_src_0_1(pc, &src0, &src1);
653	set_dst(pc, dst, e);
654	set_src_0(pc, src0, e);
655	if (src1->type == P_IMMD && !is_long(e)) {
656		if (src0->neg)
657			e->inst[0] |= 0x00008000;
658		set_immd(pc, src1, e);
659	} else {
660		set_src_1(pc, src1, e);
661		if (src0->neg ^ src1->neg) {
662			if (is_long(e))
663				e->inst[1] |= 0x08000000;
664			else
665				e->inst[0] |= 0x00008000;
666		}
667	}
668
669	emit(pc, e);
670}
671
672static void
673emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
674	 struct nv50_reg *src0, struct nv50_reg *src1)
675{
676	struct nv50_program_exec *e = exec(pc);
677
678	e->inst[0] |= 0xb0000000;
679
680	check_swap_src_0_1(pc, &src0, &src1);
681
682	if (!pc->allow32 || src0->neg || src1->neg) {
683		set_long(pc, e);
684		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
685	}
686
687	set_dst(pc, dst, e);
688	set_src_0(pc, src0, e);
689	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
690		set_src_2(pc, src1, e);
691	else
692	if (src1->type == P_IMMD)
693		set_immd(pc, src1, e);
694	else
695		set_src_1(pc, src1, e);
696
697	emit(pc, e);
698}
699
700static void
701emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
702	    struct nv50_reg *src0, struct nv50_reg *src1)
703{
704	struct nv50_program_exec *e = exec(pc);
705
706	set_long(pc, e);
707	e->inst[0] |= 0xb0000000;
708	e->inst[1] |= (sub << 29);
709
710	check_swap_src_0_1(pc, &src0, &src1);
711	set_dst(pc, dst, e);
712	set_src_0(pc, src0, e);
713	set_src_1(pc, src1, e);
714
715	emit(pc, e);
716}
717
718static INLINE void
719emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
720	 struct nv50_reg *src1)
721{
722	src1->neg ^= 1;
723	emit_add(pc, dst, src0, src1);
724	src1->neg ^= 1;
725}
726
727static void
728emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
729	 struct nv50_reg *src1, struct nv50_reg *src2)
730{
731	struct nv50_program_exec *e = exec(pc);
732
733	e->inst[0] |= 0xe0000000;
734
735	check_swap_src_0_1(pc, &src0, &src1);
736	set_dst(pc, dst, e);
737	set_src_0(pc, src0, e);
738	set_src_1(pc, src1, e);
739	set_src_2(pc, src2, e);
740
741	if (src0->neg ^ src1->neg)
742		e->inst[1] |= 0x04000000;
743	if (src2->neg)
744		e->inst[1] |= 0x08000000;
745
746	emit(pc, e);
747}
748
749static INLINE void
750emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
751	 struct nv50_reg *src1, struct nv50_reg *src2)
752{
753	src2->neg ^= 1;
754	emit_mad(pc, dst, src0, src1, src2);
755	src2->neg ^= 1;
756}
757
758static void
759emit_flop(struct nv50_pc *pc, unsigned sub,
760	  struct nv50_reg *dst, struct nv50_reg *src)
761{
762	struct nv50_program_exec *e = exec(pc);
763
764	e->inst[0] |= 0x90000000;
765	if (sub) {
766		set_long(pc, e);
767		e->inst[1] |= (sub << 29);
768	}
769
770	set_dst(pc, dst, e);
771	set_src_0(pc, src, e);
772
773	emit(pc, e);
774}
775
776static void
777emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
778{
779	struct nv50_program_exec *e = exec(pc);
780
781	e->inst[0] |= 0xb0000000;
782
783	set_dst(pc, dst, e);
784	set_src_0(pc, src, e);
785	set_long(pc, e);
786	e->inst[1] |= (6 << 29) | 0x00004000;
787
788	emit(pc, e);
789}
790
791static void
792emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
793{
794	struct nv50_program_exec *e = exec(pc);
795
796	e->inst[0] |= 0xb0000000;
797
798	set_dst(pc, dst, e);
799	set_src_0(pc, src, e);
800	set_long(pc, e);
801	e->inst[1] |= (6 << 29);
802
803	emit(pc, e);
804}
805
806#define CVTOP_RN	0x01
807#define CVTOP_FLOOR	0x03
808#define CVTOP_CEIL	0x05
809#define CVTOP_TRUNC	0x07
810#define CVTOP_SAT	0x08
811#define CVTOP_ABS	0x10
812
813/* 0x04 == 32 bit */
814/* 0x40 == dst is float */
815/* 0x80 == src is float */
816#define CVT_F32_F32 0xc4
817#define CVT_F32_S32 0x44
818#define CVT_F32_U32 0x64
819#define CVT_S32_F32 0x8c
820#define CVT_S32_S32 0x0c
821#define CVT_F32_F32_ROP 0xcc
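/* e.g. CVT_F32_S32 == 0x44 == 0x04 | 0x40: a 32-bit conversion with a
 * float destination and an integer source. CVT_F32_F32_ROP sets an
 * extra bit on top of CVT_F32_F32, apparently selecting the rounding
 * variant used by emit_flr.
 */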
822
823static void
824emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
825	 int wp, unsigned cvn, unsigned fmt)
826{
827	struct nv50_program_exec *e;
828
829	e = exec(pc);
830	set_long(pc, e);
831
832	e->inst[0] |= 0xa0000000;
833	e->inst[1] |= 0x00004000;
834	e->inst[1] |= (cvn << 16);
835	e->inst[1] |= (fmt << 24);
836	set_src_0(pc, src, e);
837
838	if (wp >= 0)
839		set_pred_wr(pc, 1, wp, e);
840
841	if (dst)
842		set_dst(pc, dst, e);
843	else {
844		e->inst[0] |= 0x000001fc;
845		e->inst[1] |= 0x00000008;
846	}
847
848	emit(pc, e);
849}
850
851/* nv50 Condition codes:
852 *  0x1 = LT
853 *  0x2 = EQ
854 *  0x3 = LE
855 *  0x4 = GT
856 *  0x5 = NE
857 *  0x6 = GE
858 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
859 *  0x8 = unordered bit (allows NaN)
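 *  e.g. SGE maps to 0x6 (GE) and SNE to 0x5 | 0x8 = 0xd (NE or
 *  unordered), see map_tgsi_setop_cc below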
860 */
861static void
862emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
863	 struct nv50_reg *src0, struct nv50_reg *src1)
864{
865	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
866
867	struct nv50_program_exec *e = exec(pc);
868	struct nv50_reg *rdst;
869
870	assert(ccode < 16);
871	if (check_swap_src_0_1(pc, &src0, &src1))
872		ccode = cc_swapped[ccode & 7] | (ccode & 8);
873
874	rdst = dst;
875	if (dst && dst->type != P_TEMP)
876		dst = alloc_temp(pc, NULL);
877
878	/* set.u32 */
879	set_long(pc, e);
880	e->inst[0] |= 0xb0000000;
881	e->inst[1] |= 0x60000000 | (ccode << 14);
882
883	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
884	 * that doesn't seem to match what the hw actually does
885	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
886	 */
887
888	if (wp >= 0)
889		set_pred_wr(pc, 1, wp, e);
890	if (dst)
891		set_dst(pc, dst, e);
892	else {
893		e->inst[0] |= 0x000001fc;
894		e->inst[1] |= 0x00000008;
895	}
896
897	set_src_0(pc, src0, e);
898	set_src_1(pc, src1, e);
899
900	emit(pc, e);
901	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */
902
903	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
904	if (rdst)
905		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
906	if (rdst && rdst != dst)
907		free_temp(pc, dst);
908}
909
910static INLINE unsigned
911map_tgsi_setop_cc(unsigned op)
912{
913	switch (op) {
914	case TGSI_OPCODE_SLT: return 0x1;
915	case TGSI_OPCODE_SGE: return 0x6;
916	case TGSI_OPCODE_SEQ: return 0x2;
917	case TGSI_OPCODE_SGT: return 0x4;
918	case TGSI_OPCODE_SLE: return 0x3;
919	case TGSI_OPCODE_SNE: return 0xd;
920	default:
921		assert(0);
922		return 0;
923	}
924}
925
926static INLINE void
927emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
928{
929	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
930}
931
932static void
933emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
934	 struct nv50_reg *v, struct nv50_reg *e)
935{
936	struct nv50_reg *temp = alloc_temp(pc, NULL);
937
938	emit_flop(pc, 3, temp, v);
939	emit_mul(pc, temp, temp, e);
940	emit_preex2(pc, temp, temp);
941	emit_flop(pc, 6, dst, temp);
942
943	free_temp(pc, temp);
944}
945
946static INLINE void
947emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
948{
949	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
950}
951
952static INLINE void
953emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
954{
955	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
956}
957
958static void
959emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
960	 struct nv50_reg **src)
961{
962	struct nv50_reg *one = alloc_immd(pc, 1.0);
963	struct nv50_reg *zero = alloc_immd(pc, 0.0);
964	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
965	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
966	struct nv50_reg *tmp[4];
967	boolean allow32 = pc->allow32;
968
969	pc->allow32 = FALSE;
970
971	if (mask & (3 << 1)) {
972		tmp[0] = alloc_temp(pc, NULL);
973		emit_minmax(pc, 4, tmp[0], src[0], zero);
974	}
975
976	if (mask & (1 << 2)) {
977		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
978
979		tmp[1] = temp_temp(pc);
980		emit_minmax(pc, 4, tmp[1], src[1], zero);
981
982		tmp[3] = temp_temp(pc);
983		emit_minmax(pc, 4, tmp[3], src[3], neg128);
984		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
985
986		emit_pow(pc, dst[2], tmp[1], tmp[3]);
987		emit_mov(pc, dst[2], zero);
988		set_pred(pc, 3, 0, pc->p->exec_tail);
989	}
990
991	if (mask & (1 << 1))
992		assimilate_temp(pc, dst[1], tmp[0]);
993	else
994	if (mask & (1 << 2))
995		free_temp(pc, tmp[0]);
996
997	pc->allow32 = allow32;
998
999	/* do this last, in case src[i,j] == dst[0,3] */
1000	if (mask & (1 << 0))
1001		emit_mov(pc, dst[0], one);
1002
1003	if (mask & (1 << 3))
1004		emit_mov(pc, dst[3], one);
1005
1006	FREE(pos128);
1007	FREE(neg128);
1008	FREE(zero);
1009	FREE(one);
1010}
1011
1012static void
1013emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
1014{
1015	struct nv50_program_exec *e = exec(pc);
1016
1017	set_long(pc, e);
1018	e->inst[0] |= 0xa0000000; /* delta */
1019	e->inst[1] |= (7 << 29); /* delta */
1020	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
1021	e->inst[1] |= (1 << 14); /* src .f32 */
1022	set_dst(pc, dst, e);
1023	set_src_0(pc, src, e);
1024
1025	emit(pc, e);
1026}
1027
1028static void
1029emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
1030{
1031	struct nv50_program_exec *e;
1032	const int r_pred = 1;
1033
1034	/* Sets predicate reg ? */
1035	e = exec(pc);
1036	e->inst[0] = 0xa00001fd;
1037	e->inst[1] = 0xc4014788;
1038	set_src_0(pc, src, e);
1039	set_pred_wr(pc, 1, r_pred, e);
1040	if (src->neg)
1041		e->inst[1] |= 0x20000000;
1042	emit(pc, e);
1043
1044	/* This is probably KILP */
1045	e = exec(pc);
1046	e->inst[0] = 0x000001fe;
1047	set_long(pc, e);
1048	set_pred(pc, 1 /* LT? */, r_pred, e);
1049	emit(pc, e);
1050}
1051
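/* Emit a texture fetch: copy the coordinates into a contiguous block
 * of four temps (dividing by q for TXP, or by the largest coordinate
 * for cube maps), issue the TEX on that block, then move the
 * requested components into dst.
 */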
1052static void
1053emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1054	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
1055{
1056	struct nv50_reg *temp, *t[4];
1057	struct nv50_program_exec *e;
1058
1059	unsigned c, mode, dim;
1060
1061	switch (type) {
1062	case TGSI_TEXTURE_1D:
1063		dim = 1;
1064		break;
1065	case TGSI_TEXTURE_UNKNOWN:
1066	case TGSI_TEXTURE_2D:
1067	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1068	case TGSI_TEXTURE_RECT:
1069		dim = 2;
1070		break;
1071	case TGSI_TEXTURE_3D:
1072	case TGSI_TEXTURE_CUBE:
1073	case TGSI_TEXTURE_SHADOW2D:
1074	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1075		dim = 3;
1076		break;
1077	default:
1078		assert(0);
1079		break;
1080	}
1081
1082	/* some cards need t[0]'s hw index to be a multiple of 4 */
1083	alloc_temp4(pc, t, 0);
1084
1085	if (proj) {
1086		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1087			mode = pc->interp_mode[src[0]->index];
1088
1089			t[3]->rhw = src[3]->rhw;
1090			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1091			emit_flop(pc, 0, t[3], t[3]);
1092
1093			for (c = 0; c < dim; c++) {
1094				t[c]->rhw = src[c]->rhw;
1095				emit_interp(pc, t[c], t[3],
1096					    (mode | INTERP_PERSPECTIVE));
1097			}
1098		} else {
1099			emit_flop(pc, 0, t[3], src[3]);
1100			for (c = 0; c < dim; c++)
1101				emit_mul(pc, t[c], src[c], t[3]);
1102
1103			/* XXX: for some reason the blob sometimes uses MAD:
1104			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1105			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1106			 */
1107		}
1108	} else {
1109		if (type == TGSI_TEXTURE_CUBE) {
1110			temp = temp_temp(pc);
1111			emit_minmax(pc, 4, temp, src[0], src[1]);
1112			emit_minmax(pc, 4, temp, temp, src[2]);
1113			emit_flop(pc, 0, temp, temp);
1114			for (c = 0; c < 3; c++)
1115				emit_mul(pc, t[c], src[c], temp);
1116		} else {
1117			for (c = 0; c < dim; c++)
1118				emit_mov(pc, t[c], src[c]);
1119		}
1120	}
1121
1122	e = exec(pc);
1123	set_long(pc, e);
1124	e->inst[0] |= 0xf0000000;
1125	e->inst[1] |= 0x00000004;
1126	set_dst(pc, t[0], e);
1127	e->inst[0] |= (unit << 9);
1128
1129	if (dim == 2)
1130		e->inst[0] |= 0x00400000;
1131	else
1132	if (dim == 3)
1133		e->inst[0] |= 0x00800000;
1134
1135	e->inst[0] |= (mask & 0x3) << 25;
1136	e->inst[1] |= (mask & 0xc) << 12;
1137
1138	emit(pc, e);
1139
1140#if 1
1141	if (mask & 1) emit_mov(pc, dst[0], t[0]);
1142	if (mask & 2) emit_mov(pc, dst[1], t[1]);
1143	if (mask & 4) emit_mov(pc, dst[2], t[2]);
1144	if (mask & 8) emit_mov(pc, dst[3], t[3]);
1145
1146	free_temp4(pc, t);
1147#else
1148	/* XXX: if e.g. MUL is used directly after TEX, it would still use
1149	 * the texture coordinates, not the fetched values: latency ? */
1150
1151	for (c = 0; c < 4; c++) {
1152		if (mask & (1 << c))
1153			assimilate_temp(pc, dst[c], t[c]);
1154		else
1155			free_temp(pc, t[c]);
1156	}
1157#endif
1158}
1159
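/* Emit a (conditional) branch. If join is non-NULL, an extra insn
 * (presumably the join_at mentioned in the IF handling) is emitted
 * first and returned through *join so the caller can patch its
 * target (param.index) later.
 */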
1160static void
1161emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
1162	    struct nv50_program_exec **join)
1163{
1164	struct nv50_program_exec *e = exec(pc);
1165
1166	if (join) {
1167		set_long(pc, e);
1168		e->inst[0] |= 0xa0000002;
1169		emit(pc, e);
1170		*join = e;
1171		e = exec(pc);
1172	}
1173
1174	set_long(pc, e);
1175	e->inst[0] |= 0x10000002;
1176	if (pred >= 0)
1177		set_pred(pc, cc, pred, e);
1178	emit(pc, e);
1179}
1180
1181static void
1182emit_nop(struct nv50_pc *pc)
1183{
1184	struct nv50_program_exec *e = exec(pc);
1185
1186	e->inst[0] = 0xf0000000;
1187	set_long(pc, e);
1188	e->inst[1] = 0xe0000000;
1189	emit(pc, e);
1190}
1191
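/* Convert a short (single-word) instruction into its long (two-word)
 * form, relocating the opcode-specific fields that change position.
 */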
1192static void
1193convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1194{
1195	unsigned q = 0, m = ~0;
1196
1197	assert(!is_long(e));
1198
1199	switch (e->inst[0] >> 28) {
1200	case 0x1:
1201		/* MOV */
1202		q = 0x0403c000;
1203		m = 0xffff7fff;
1204		break;
1205	case 0x8:
1206		/* INTERP (move centroid, perspective and flat bits) */
1207		m = ~0x03000100;
1208		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1209		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1210		break;
1211	case 0x9:
1212		/* RCP */
1213		break;
1214	case 0xB:
1215		/* ADD */
1216		m = ~(127 << 16);
1217		q = ((e->inst[0] & (~m)) >> 2);
1218		break;
1219	case 0xC:
1220		/* MUL */
1221		m = ~0x00008000;
1222		q = ((e->inst[0] & (~m)) << 12);
1223		break;
1224	case 0xE:
1225		/* MAD (if src2 == dst) */
1226		q = ((e->inst[0] & 0x1fc) << 12);
1227		break;
1228	default:
1229		assert(0);
1230		break;
1231	}
1232
1233	set_long(pc, e);
1234	pc->p->exec_size++;
1235
1236	e->inst[0] &= m;
1237	e->inst[1] |= q;
1238}
1239
1240static boolean
1241negate_supported(const struct tgsi_full_instruction *insn, int i)
1242{
1243	switch (insn->Instruction.Opcode) {
1244	case TGSI_OPCODE_DP3:
1245	case TGSI_OPCODE_DP4:
1246	case TGSI_OPCODE_MUL:
1247	case TGSI_OPCODE_KIL:
1248	case TGSI_OPCODE_ADD:
1249	case TGSI_OPCODE_SUB:
1250	case TGSI_OPCODE_MAD:
1251		return TRUE;
1252	case TGSI_OPCODE_POW:
1253		return (i == 1) ? TRUE : FALSE;
1254	default:
1255		return FALSE;
1256	}
1257}
1258
1259/* Return a read mask for source registers deduced from opcode & write mask. */
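/* e.g. DP3 reads the xyz components of both sources regardless of the
 * write mask (0x7), while XPD only needs source components y and z to
 * produce dst.x (0x6).
 */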
1260static unsigned
1261nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
1262{
1263	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
1264
1265	switch (insn->Instruction.Opcode) {
1266	case TGSI_OPCODE_COS:
1267	case TGSI_OPCODE_SIN:
1268		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
1269	case TGSI_OPCODE_DP3:
1270		return 0x7;
1271	case TGSI_OPCODE_DP4:
1272	case TGSI_OPCODE_DPH:
1273	case TGSI_OPCODE_KIL: /* WriteMask ignored */
1274		return 0xf;
1275	case TGSI_OPCODE_DST:
1276		return mask & (c ? 0xa : 0x6);
1277	case TGSI_OPCODE_EX2:
1278	case TGSI_OPCODE_LG2:
1279	case TGSI_OPCODE_POW:
1280	case TGSI_OPCODE_RCP:
1281	case TGSI_OPCODE_RSQ:
1282	case TGSI_OPCODE_SCS:
1283		return 0x1;
1284	case TGSI_OPCODE_LIT:
1285		return 0xb;
1286	case TGSI_OPCODE_TEX:
1287	case TGSI_OPCODE_TXP:
1288	{
1289		const struct tgsi_instruction_ext_texture *tex;
1290
1291		assert(insn->Instruction.Extended);
1292		tex = &insn->InstructionExtTexture;
1293
1294		mask = 0x7;
1295		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1296			mask |= 0x8;
1297
1298		switch (tex->Texture) {
1299		case TGSI_TEXTURE_1D:
1300			mask &= 0x9;
1301			break;
1302		case TGSI_TEXTURE_2D:
1303			mask &= 0xb;
1304			break;
1305		default:
1306			break;
1307		}
1308	}
1309		return mask;
1310	case TGSI_OPCODE_XPD:
1311		x = 0;
1312		if (mask & 1) x |= 0x6;
1313		if (mask & 2) x |= 0x5;
1314		if (mask & 4) x |= 0x3;
1315		return x;
1316	default:
1317		break;
1318	}
1319
1320	return mask;
1321}
1322
1323static struct nv50_reg *
1324tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1325{
1326	switch (dst->DstRegister.File) {
1327	case TGSI_FILE_TEMPORARY:
1328		return &pc->temp[dst->DstRegister.Index * 4 + c];
1329	case TGSI_FILE_OUTPUT:
1330		return &pc->result[dst->DstRegister.Index * 4 + c];
1331	case TGSI_FILE_NULL:
1332		return NULL;
1333	default:
1334		break;
1335	}
1336
1337	return NULL;
1338}
1339
1340static struct nv50_reg *
1341tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1342	 boolean neg)
1343{
1344	struct nv50_reg *r = NULL;
1345	struct nv50_reg *temp;
1346	unsigned sgn, c;
1347
1348	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1349
1350	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1351	switch (c) {
1352	case TGSI_EXTSWIZZLE_X:
1353	case TGSI_EXTSWIZZLE_Y:
1354	case TGSI_EXTSWIZZLE_Z:
1355	case TGSI_EXTSWIZZLE_W:
1356		switch (src->SrcRegister.File) {
1357		case TGSI_FILE_INPUT:
1358			r = &pc->attr[src->SrcRegister.Index * 4 + c];
1359			break;
1360		case TGSI_FILE_TEMPORARY:
1361			r = &pc->temp[src->SrcRegister.Index * 4 + c];
1362			break;
1363		case TGSI_FILE_CONSTANT:
1364			r = &pc->param[src->SrcRegister.Index * 4 + c];
1365			break;
1366		case TGSI_FILE_IMMEDIATE:
1367			r = &pc->immd[src->SrcRegister.Index * 4 + c];
1368			break;
1369		case TGSI_FILE_SAMPLER:
1370			break;
1371		default:
1372			assert(0);
1373			break;
1374		}
1375		break;
1376	case TGSI_EXTSWIZZLE_ZERO:
1377		r = alloc_immd(pc, 0.0);
1378		return r;
1379	case TGSI_EXTSWIZZLE_ONE:
1380		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1381			return alloc_immd(pc, -1.0);
1382		return alloc_immd(pc, 1.0);
1383	default:
1384		assert(0);
1385		break;
1386	}
1387
1388	switch (sgn) {
1389	case TGSI_UTIL_SIGN_KEEP:
1390		break;
1391	case TGSI_UTIL_SIGN_CLEAR:
1392		temp = temp_temp(pc);
1393		emit_abs(pc, temp, r);
1394		r = temp;
1395		break;
1396	case TGSI_UTIL_SIGN_TOGGLE:
1397		if (neg)
1398			r->neg = 1;
1399		else {
1400			temp = temp_temp(pc);
1401			emit_neg(pc, temp, r);
1402			r = temp;
1403		}
1404		break;
1405	case TGSI_UTIL_SIGN_SET:
1406		temp = temp_temp(pc);
1407		emit_abs(pc, temp, r);
1408		if (neg)
1409			temp->neg = 1;
1410		else
1411			emit_neg(pc, temp, temp);
1412		r = temp;
1413		break;
1414	default:
1415		assert(0);
1416		break;
1417	}
1418
1419	return r;
1420}
1421
1422/* return TRUE for ops that produce only a single result */
1423static boolean
1424is_scalar_op(unsigned op)
1425{
1426	switch (op) {
1427	case TGSI_OPCODE_COS:
1428	case TGSI_OPCODE_DP2:
1429	case TGSI_OPCODE_DP3:
1430	case TGSI_OPCODE_DP4:
1431	case TGSI_OPCODE_DPH:
1432	case TGSI_OPCODE_EX2:
1433	case TGSI_OPCODE_LG2:
1434	case TGSI_OPCODE_POW:
1435	case TGSI_OPCODE_RCP:
1436	case TGSI_OPCODE_RSQ:
1437	case TGSI_OPCODE_SIN:
1438		/*
1439	case TGSI_OPCODE_KIL:
1440	case TGSI_OPCODE_LIT:
1441	case TGSI_OPCODE_SCS:
1442		*/
1443		return TRUE;
1444	default:
1445		return FALSE;
1446	}
1447}
1448
1449/* Returns a bitmask indicating which dst components depend
1450 * on source s, component c (reverse of nv50_tgsi_src_mask).
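 * E.g. for XPD only dst.y and dst.z read source component x, so
 * nv50_tgsi_dst_revdep(TGSI_OPCODE_XPD, s, 0) yields 0x6.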
1451 */
1452static unsigned
1453nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1454{
1455	if (is_scalar_op(op))
1456		return 0x1;
1457
1458	switch (op) {
1459	case TGSI_OPCODE_DST:
1460		return (1 << c) & (s ? 0xa : 0x6);
1461	case TGSI_OPCODE_XPD:
1462		switch (c) {
1463		case 0: return 0x6;
1464		case 1: return 0x5;
1465		case 2: return 0x3;
1466		case 3: return 0x0;
1467		default:
1468			assert(0);
1469			return 0x0;
1470		}
1471	case TGSI_OPCODE_LIT:
1472	case TGSI_OPCODE_SCS:
1473	case TGSI_OPCODE_TEX:
1474	case TGSI_OPCODE_TXP:
1475		/* these take care of dangerous swizzles themselves */
1476		return 0x0;
1477	case TGSI_OPCODE_IF:
1478	case TGSI_OPCODE_KIL:
1479		/* don't call this function for these ops */
1480		assert(0);
1481		return 0;
1482	default:
1483		/* linear vector instruction */
1484		return (1 << c);
1485	}
1486}
1487
1488static boolean
1489nv50_program_tx_insn(struct nv50_pc *pc,
1490		     const struct tgsi_full_instruction *inst)
1491{
1492	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
1493	unsigned mask, sat, unit;
1494	int i, c;
1495
1496	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1497	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1498
1499	memset(src, 0, sizeof(src));
1500
1501	for (c = 0; c < 4; c++) {
1502		if ((mask & (1 << c)) && !pc->r_dst[c])
1503			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1504		else
1505			dst[c] = pc->r_dst[c];
1506		rdst[c] = dst[c];
1507	}
1508
1509	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1510		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1511		unsigned src_mask;
1512		boolean neg_supp;
1513
1514		src_mask = nv50_tgsi_src_mask(inst, i);
1515		neg_supp = negate_supported(inst, i);
1516
1517		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1518			unit = fs->SrcRegister.Index;
1519
1520		for (c = 0; c < 4; c++)
1521			if (src_mask & (1 << c))
1522				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
1523	}
1524
1525	brdc = temp = pc->r_brdc;
1526	if (brdc && brdc->type != P_TEMP) {
1527		temp = temp_temp(pc);
1528		if (sat)
1529			brdc = temp;
1530	} else
1531	if (sat) {
1532		for (c = 0; c < 4; c++) {
1533			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
1534				continue;
1535			rdst[c] = dst[c];
1536			dst[c] = temp_temp(pc);
1537		}
1538	}
1539
1540	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
1541
1542	switch (inst->Instruction.Opcode) {
1543	case TGSI_OPCODE_ABS:
1544		for (c = 0; c < 4; c++) {
1545			if (!(mask & (1 << c)))
1546				continue;
1547			emit_abs(pc, dst[c], src[0][c]);
1548		}
1549		break;
1550	case TGSI_OPCODE_ADD:
1551		for (c = 0; c < 4; c++) {
1552			if (!(mask & (1 << c)))
1553				continue;
1554			emit_add(pc, dst[c], src[0][c], src[1][c]);
1555		}
1556		break;
1557	case TGSI_OPCODE_CEIL:
1558		for (c = 0; c < 4; c++) {
1559			if (!(mask & (1 << c)))
1560				continue;
1561			emit_cvt(pc, dst[c], src[0][c], -1,
1562				 CVTOP_CEIL, CVT_F32_F32);
1563		}
1564		break;
1565	case TGSI_OPCODE_COS:
1566		if (mask & 8) {
1567			emit_precossin(pc, temp, src[0][3]);
1568			emit_flop(pc, 5, dst[3], temp);
1569			if (!(mask &= 7))
1570				break;
1571			if (temp == dst[3])
1572				temp = brdc = temp_temp(pc);
1573		}
1574		emit_precossin(pc, temp, src[0][0]);
1575		emit_flop(pc, 5, brdc, temp);
1576		break;
1577	case TGSI_OPCODE_DP3:
1578		emit_mul(pc, temp, src[0][0], src[1][0]);
1579		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1580		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
1581		break;
1582	case TGSI_OPCODE_DP4:
1583		emit_mul(pc, temp, src[0][0], src[1][0]);
1584		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1585		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1586		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
1587		break;
1588	case TGSI_OPCODE_DPH:
1589		emit_mul(pc, temp, src[0][0], src[1][0]);
1590		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1591		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1592		emit_add(pc, brdc, src[1][3], temp);
1593		break;
1594	case TGSI_OPCODE_DST:
1595		if (mask & (1 << 1))
1596			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1597		if (mask & (1 << 2))
1598			emit_mov(pc, dst[2], src[0][2]);
1599		if (mask & (1 << 3))
1600			emit_mov(pc, dst[3], src[1][3]);
1601		if (mask & (1 << 0))
1602			emit_mov_immdval(pc, dst[0], 1.0f);
1603		break;
1604	case TGSI_OPCODE_ELSE:
1605		emit_branch(pc, -1, 0, NULL);
1606		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
1607		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
1608		break;
1609	case TGSI_OPCODE_ENDIF:
1610		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
1611
1612		if (pc->br_join[pc->if_lvl]) {
1613			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
1614			pc->br_join[pc->if_lvl] = NULL;
1615		}
1616		/* emit a NOP as join point; we could set it on the next
1617		 * insn instead, but would have to make sure it is long and !immd
1618		 */
1619		emit_nop(pc);
1620		pc->p->exec_tail->inst[1] |= 2;
1621		break;
1622	case TGSI_OPCODE_EX2:
1623		emit_preex2(pc, temp, src[0][0]);
1624		emit_flop(pc, 6, brdc, temp);
1625		break;
1626	case TGSI_OPCODE_FLR:
1627		for (c = 0; c < 4; c++) {
1628			if (!(mask & (1 << c)))
1629				continue;
1630			emit_flr(pc, dst[c], src[0][c]);
1631		}
1632		break;
1633	case TGSI_OPCODE_FRC:
1634		temp = temp_temp(pc);
1635		for (c = 0; c < 4; c++) {
1636			if (!(mask & (1 << c)))
1637				continue;
1638			emit_flr(pc, temp, src[0][c]);
1639			emit_sub(pc, dst[c], src[0][c], temp);
1640		}
1641		break;
1642	case TGSI_OPCODE_IF:
1643		/* emitting a join_at may not be necessary */
1644		assert(pc->if_lvl < MAX_IF_DEPTH);
1645		set_pred_wr(pc, 1, 0, pc->if_cond);
1646		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
1647		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
1648		break;
1649	case TGSI_OPCODE_KIL:
1650		emit_kil(pc, src[0][0]);
1651		emit_kil(pc, src[0][1]);
1652		emit_kil(pc, src[0][2]);
1653		emit_kil(pc, src[0][3]);
1654		break;
1655	case TGSI_OPCODE_LIT:
1656		emit_lit(pc, &dst[0], mask, &src[0][0]);
1657		break;
1658	case TGSI_OPCODE_LG2:
1659		emit_flop(pc, 3, brdc, src[0][0]);
1660		break;
1661	case TGSI_OPCODE_LRP:
1662		temp = temp_temp(pc);
1663		for (c = 0; c < 4; c++) {
1664			if (!(mask & (1 << c)))
1665				continue;
1666			emit_sub(pc, temp, src[1][c], src[2][c]);
1667			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1668		}
1669		break;
1670	case TGSI_OPCODE_MAD:
1671		for (c = 0; c < 4; c++) {
1672			if (!(mask & (1 << c)))
1673				continue;
1674			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1675		}
1676		break;
1677	case TGSI_OPCODE_MAX:
1678		for (c = 0; c < 4; c++) {
1679			if (!(mask & (1 << c)))
1680				continue;
1681			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1682		}
1683		break;
1684	case TGSI_OPCODE_MIN:
1685		for (c = 0; c < 4; c++) {
1686			if (!(mask & (1 << c)))
1687				continue;
1688			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1689		}
1690		break;
1691	case TGSI_OPCODE_MOV:
1692	case TGSI_OPCODE_SWZ:
1693		for (c = 0; c < 4; c++) {
1694			if (!(mask & (1 << c)))
1695				continue;
1696			emit_mov(pc, dst[c], src[0][c]);
1697		}
1698		break;
1699	case TGSI_OPCODE_MUL:
1700		for (c = 0; c < 4; c++) {
1701			if (!(mask & (1 << c)))
1702				continue;
1703			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1704		}
1705		break;
1706	case TGSI_OPCODE_POW:
1707		emit_pow(pc, brdc, src[0][0], src[1][0]);
1708		break;
1709	case TGSI_OPCODE_RCP:
1710		emit_flop(pc, 0, brdc, src[0][0]);
1711		break;
1712	case TGSI_OPCODE_RSQ:
1713		emit_flop(pc, 2, brdc, src[0][0]);
1714		break;
1715	case TGSI_OPCODE_SCS:
1716		temp = temp_temp(pc);
1717		if (mask & 3)
1718			emit_precossin(pc, temp, src[0][0]);
1719		if (mask & (1 << 0))
1720			emit_flop(pc, 5, dst[0], temp);
1721		if (mask & (1 << 1))
1722			emit_flop(pc, 4, dst[1], temp);
1723		if (mask & (1 << 2))
1724			emit_mov_immdval(pc, dst[2], 0.0);
1725		if (mask & (1 << 3))
1726			emit_mov_immdval(pc, dst[3], 1.0);
1727		break;
1728	case TGSI_OPCODE_SIN:
1729		if (mask & 8) {
1730			emit_precossin(pc, temp, src[0][3]);
1731			emit_flop(pc, 4, dst[3], temp);
1732			if (!(mask &= 7))
1733				break;
1734			if (temp == dst[3])
1735				temp = brdc = temp_temp(pc);
1736		}
1737		emit_precossin(pc, temp, src[0][0]);
1738		emit_flop(pc, 4, brdc, temp);
1739		break;
1740	case TGSI_OPCODE_SLT:
1741	case TGSI_OPCODE_SGE:
1742	case TGSI_OPCODE_SEQ:
1743	case TGSI_OPCODE_SGT:
1744	case TGSI_OPCODE_SLE:
1745	case TGSI_OPCODE_SNE:
1746		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
1747		for (c = 0; c < 4; c++) {
1748			if (!(mask & (1 << c)))
1749				continue;
1750			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
1751		}
1752		break;
1753	case TGSI_OPCODE_SUB:
1754		for (c = 0; c < 4; c++) {
1755			if (!(mask & (1 << c)))
1756				continue;
1757			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1758		}
1759		break;
1760	case TGSI_OPCODE_TEX:
1761		emit_tex(pc, dst, mask, src[0], unit,
1762			 inst->InstructionExtTexture.Texture, FALSE);
1763		break;
1764	case TGSI_OPCODE_TXP:
1765		emit_tex(pc, dst, mask, src[0], unit,
1766			 inst->InstructionExtTexture.Texture, TRUE);
1767		break;
1768	case TGSI_OPCODE_TRUNC:
1769		for (c = 0; c < 4; c++) {
1770			if (!(mask & (1 << c)))
1771				continue;
1772			emit_cvt(pc, dst[c], src[0][c], -1,
1773				 CVTOP_TRUNC, CVT_F32_F32);
1774		}
1775		break;
1776	case TGSI_OPCODE_XPD:
1777		temp = temp_temp(pc);
1778		if (mask & (1 << 0)) {
1779			emit_mul(pc, temp, src[0][2], src[1][1]);
1780			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1781		}
1782		if (mask & (1 << 1)) {
1783			emit_mul(pc, temp, src[0][0], src[1][2]);
1784			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1785		}
1786		if (mask & (1 << 2)) {
1787			emit_mul(pc, temp, src[0][1], src[1][0]);
1788			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1789		}
1790		if (mask & (1 << 3))
1791			emit_mov_immdval(pc, dst[3], 1.0);
1792		break;
1793	case TGSI_OPCODE_END:
1794		break;
1795	default:
1796		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1797		return FALSE;
1798	}
1799
1800	if (brdc) {
1801		if (sat)
1802			emit_sat(pc, brdc, brdc);
1803		for (c = 0; c < 4; c++)
1804			if ((mask & (1 << c)) && dst[c] != brdc)
1805				emit_mov(pc, dst[c], brdc);
1806	} else
1807	if (sat) {
1808		for (c = 0; c < 4; c++) {
1809			if (!(mask & (1 << c)))
1810				continue;
1811			/* in this case we saturate later, in nv50_tgsi_insn */
1812			if (rdst[c]->type == P_TEMP && rdst[c]->index < 0)
1813				continue;
1814			emit_sat(pc, rdst[c], dst[c]);
1815		}
1816	}
1817
1818	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1819		for (c = 0; c < 4; c++) {
1820			if (!src[i][c])
1821				continue;
1822			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1823				FREE(src[i][c]);
1824		}
1825	}
1826
1827	kill_temp_temp(pc);
1828	return TRUE;
1829}
1830
1831static void
1832prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
1833{
1834	struct nv50_reg *reg = NULL;
1835	const struct tgsi_full_src_register *src;
1836	const struct tgsi_dst_register *dst;
1837	unsigned i, c, k, mask;
1838
1839	dst = &insn->FullDstRegisters[0].DstRegister;
1840	mask = dst->WriteMask;
1841
1842	if (dst->File == TGSI_FILE_TEMPORARY)
1843		reg = pc->temp;
1844	else
1845	if (dst->File == TGSI_FILE_OUTPUT)
1846		reg = pc->result;
1847
1848	if (reg) {
1849		for (c = 0; c < 4; c++) {
1850			if (!(mask & (1 << c)))
1851				continue;
1852			reg[dst->Index * 4 + c].acc = pc->insn_nr;
1853		}
1854	}
1855
1856	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1857		src = &insn->FullSrcRegisters[i];
1858
1859		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
1860			reg = pc->temp;
1861		else
1862		if (src->SrcRegister.File == TGSI_FILE_INPUT)
1863			reg = pc->attr;
1864		else
1865			continue;
1866
1867		mask = nv50_tgsi_src_mask(insn, i);
1868
1869		for (c = 0; c < 4; c++) {
1870			if (!(mask & (1 << c)))
1871				continue;
1872			k = tgsi_util_get_full_src_register_extswizzle(src, c);
1873
1874			if (k > TGSI_EXTSWIZZLE_W)
1875				continue;
1876
1877			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
1878		}
1879	}
1880}
1881
1882/* Returns a bitmask indicating which dst components need to be
1883 * written to temporaries first to avoid 'corrupting' sources.
1884 *
1885 * m[i]   (out) indicates the component to write in the i-th position
1886 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
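 *
 * E.g. for "MOV TEMP[0].xy, TEMP[0].yx" we get rdep[0] = 0x2 and
 * rdep[1] = 0x1; no ordering can resolve the swap, so the component
 * written first is flagged unsafe and routed through a temporary by
 * nv50_tgsi_insn.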
1887 */
1888static unsigned
1889nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
1890{
1891	unsigned i, c, x, unsafe = 0;
1892
1893	for (c = 0; c < 4; c++)
1894		m[c] = c;
1895
1896	/* Swap as long as a dst component written earlier is depended on
1897	 * by one written later, but the next one isn't depended on by it.
1898	 */
1899	for (c = 0; c < 3; c++) {
1900		if (rdep[m[c + 1]] & (1 << m[c]))
1901			continue; /* if next one is depended on by us */
1902		for (i = c + 1; i < 4; i++)
1903			/* if we are depended on by a later one */
1904			if (rdep[m[c]] & (1 << m[i]))
1905				break;
1906		if (i == 4)
1907			continue;
1908		/* now, swap */
1909		x = m[c];
1910		m[c] = m[c + 1];
1911		m[c + 1] = x;
1912
1913		/* restart */
1914		c = 0;
1915	}
1916
1917	/* mark dependencies that could not be resolved by reordering */
1918	for (i = 0; i < 3; ++i)
1919		for (c = i + 1; c < 4; ++c)
1920			if (rdep[m[i]] & (1 << m[c]))
1921				unsafe |= (1 << i);
1922
1923	/* NOTE: $unsafe is with respect to order, not component */
1924	return unsafe;
1925}
1926
1927/* Select a suitable dst register for broadcasting scalar results,
1928 * or return NULL if we have to allocate an extra TEMP.
1929 *
1930 * If e.g. only 1 component is written, we may also emit the final
1931 * result to a write-only register.
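 *
 * For a TEMPORARY dst we pick a written component whose current value
 * is not read by any source (mask is the deqs bitmask from
 * nv50_tgsi_scan_swizzle), so broadcasting into it cannot clobber an
 * operand.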
1932 */
1933static struct nv50_reg *
1934tgsi_broadcast_dst(struct nv50_pc *pc,
1935		   const struct tgsi_full_dst_register *fd, unsigned mask)
1936{
1937	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
1938		int c = ffs(~mask & fd->DstRegister.WriteMask);
1939		if (c)
1940			return tgsi_dst(pc, c - 1, fd);
1941	} else {
1942		int c = ffs(fd->DstRegister.WriteMask) - 1;
1943		if ((1 << c) == fd->DstRegister.WriteMask)
1944			return tgsi_dst(pc, c, fd);
1945	}
1946
1947	return NULL;
1948}
1949
1950/* Scan source swizzles and return a bitmask indicating dst regs that
1951 * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
1952 */
1953static unsigned
1954nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
1955		       unsigned rdep[4])
1956{
1957	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
1958	const struct tgsi_full_src_register *fs;
1959	unsigned i, deqs = 0;
1960
1961	for (i = 0; i < 4; ++i)
1962		rdep[i] = 0;
1963
1964	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1965		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
1966		boolean neg_supp = negate_supported(insn, i);
1967
1968		fs = &insn->FullSrcRegisters[i];
1969		if (fs->SrcRegister.File != fd->DstRegister.File ||
1970		    fs->SrcRegister.Index != fd->DstRegister.Index)
1971			continue;
1972
1973		for (chn = 0; chn < 4; ++chn) {
1974			unsigned s, c;
1975
1976			if (!(mask & (1 << chn))) /* src is not read */
1977				continue;
1978			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
1979			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
1980
1981			if (c > TGSI_EXTSWIZZLE_W ||
1982			    !(fd->DstRegister.WriteMask & (1 << c)))
1983				continue;
1984
1985			/* no danger if src is copied to TEMP first */
1986			if ((s != TGSI_UTIL_SIGN_KEEP) &&
1987			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
1988				continue;
1989
1990			rdep[c] |= nv50_tgsi_dst_revdep(
1991				insn->Instruction.Opcode, i, chn);
1992			deqs |= (1 << c);
1993		}
1994	}
1995
1996	return deqs;
1997}
1998
1999static boolean
2000nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
2001{
2002	struct tgsi_full_instruction insn = tok->FullInstruction;
2003	const struct tgsi_full_dst_register *fd;
2004	unsigned i, deqs, rdep[4], m[4];
2005
2006	fd = &tok->FullInstruction.FullDstRegisters[0];
2007	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
2008
2009	if (is_scalar_op(insn.Instruction.Opcode)) {
2010		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
2011		if (!pc->r_brdc)
2012			pc->r_brdc = temp_temp(pc);
2013		return nv50_program_tx_insn(pc, &insn);
2014	}
2015	pc->r_brdc = NULL;
2016
2017	if (!deqs)
2018		return nv50_program_tx_insn(pc, &insn);
2019
2020	deqs = nv50_revdep_reorder(m, rdep);
2021
2022	for (i = 0; i < 4; ++i) {
2023		assert(pc->r_dst[m[i]] == NULL);
2024
2025		insn.FullDstRegisters[0].DstRegister.WriteMask =
2026			fd->DstRegister.WriteMask & (1 << m[i]);
2027
2028		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
2029			continue;
2030
2031		if (deqs & (1 << i))
2032			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
2033
2034		if (!nv50_program_tx_insn(pc, &insn))
2035			return FALSE;
2036	}
2037
2038	for (i = 0; i < 4; i++) {
2039		struct nv50_reg *reg = pc->r_dst[i];
2040		if (!reg)
2041			continue;
2042		pc->r_dst[i] = NULL;
2043
2044		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
2045			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
2046		else
2047			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
2048		free_temp(pc, reg);
2049	}
2050
2051	return TRUE;
2052}
2053
2054static void
2055load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
2056{
2057	struct nv50_reg *iv, **ppiv;
2058	unsigned mode = pc->interp_mode[reg->index];
2059
2060	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
2061	iv = *ppiv;
2062
2063	if ((mode & INTERP_PERSPECTIVE) && !iv) {
2064		iv = *ppiv = alloc_temp(pc, NULL);
2065		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
2066
2067		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
2068		emit_flop(pc, 0, iv, iv);
2069
2070		/* XXX: when loading interpolants dynamically, move these
2071		 * to the program head, or make sure it can't be skipped.
2072		 */
2073	}
2074
2075	emit_interp(pc, reg, iv, mode);
2076}
2077
2078static boolean
2079nv50_program_tx_prep(struct nv50_pc *pc)
2080{
2081	struct tgsi_parse_context tp;
2082	struct nv50_program *p = pc->p;
2083	boolean ret = FALSE;
2084	unsigned i, c, flat_nr = 0;
2085
2086	tgsi_parse_init(&tp, pc->p->pipe.tokens);
2087	while (!tgsi_parse_end_of_tokens(&tp)) {
2088		const union tgsi_full_token *tok = &tp.FullToken;
2089
2090		tgsi_parse_token(&tp);
2091		switch (tok->Token.Type) {
2092		case TGSI_TOKEN_TYPE_IMMEDIATE:
2093		{
2094			const struct tgsi_full_immediate *imm =
2095				&tp.FullToken.FullImmediate;
2096
2097			ctor_immd(pc, imm->u[0].Float,
2098				      imm->u[1].Float,
2099				      imm->u[2].Float,
2100				      imm->u[3].Float);
2101		}
2102			break;
2103		case TGSI_TOKEN_TYPE_DECLARATION:
2104		{
2105			const struct tgsi_full_declaration *d;
2106			unsigned si, last, first, mode;
2107
2108			d = &tp.FullToken.FullDeclaration;
2109			first = d->DeclarationRange.First;
2110			last = d->DeclarationRange.Last;
2111
2112			switch (d->Declaration.File) {
2113			case TGSI_FILE_TEMPORARY:
2114				break;
2115			case TGSI_FILE_OUTPUT:
2116				if (!d->Declaration.Semantic ||
2117				    p->type == PIPE_SHADER_FRAGMENT)
2118					break;
2119
2120				si = d->Semantic.SemanticIndex;
2121				switch (d->Semantic.SemanticName) {
2122				case TGSI_SEMANTIC_BCOLOR:
2123					p->cfg.two_side[si].hw = first;
2124					if (p->cfg.io_nr > first)
2125						p->cfg.io_nr = first;
2126					break;
2127				case TGSI_SEMANTIC_PSIZE:
2128					p->cfg.psiz = first;
2129					if (p->cfg.io_nr > first)
2130						p->cfg.io_nr = first;
2131					break;
2132					/*
2133				case TGSI_SEMANTIC_CLIP_DISTANCE:
2134					p->cfg.clpd = MIN2(p->cfg.clpd, first);
2135					break;
2136					*/
2137				default:
2138					break;
2139				}
2140				break;
2141			case TGSI_FILE_INPUT:
2142			{
2143				if (p->type != PIPE_SHADER_FRAGMENT)
2144					break;
2145
2146				switch (d->Declaration.Interpolate) {
2147				case TGSI_INTERPOLATE_CONSTANT:
2148					mode = INTERP_FLAT;
2149					flat_nr++;
2150					break;
2151				case TGSI_INTERPOLATE_PERSPECTIVE:
2152					mode = INTERP_PERSPECTIVE;
2153					p->cfg.regs[1] |= 0x08 << 24;
2154					break;
2155				default:
2156					mode = INTERP_LINEAR;
2157					break;
2158				}
2159				if (d->Declaration.Centroid)
2160					mode |= INTERP_CENTROID;
2161
2162				assert(last < 32);
2163				for (i = first; i <= last; i++)
2164					pc->interp_mode[i] = mode;
2165			}
2166				break;
2167			case TGSI_FILE_CONSTANT:
2168				break;
2169			case TGSI_FILE_SAMPLER:
2170				break;
2171			default:
2172				NOUVEAU_ERR("bad decl file %d\n",
2173					    d->Declaration.File);
2174				goto out_err;
2175			}
2176		}
2177			break;
2178		case TGSI_TOKEN_TYPE_INSTRUCTION:
2179			pc->insn_nr++;
2180			prep_inspect_insn(pc, &tok->FullInstruction);
2181			break;
2182		default:
2183			break;
2184		}
2185	}
2186
2187	if (p->type == PIPE_SHADER_VERTEX) {
2188		int rid = 0;
2189
2190		for (i = 0; i < pc->attr_nr * 4; ++i) {
2191			if (pc->attr[i].acc) {
2192				pc->attr[i].hw = rid++;
2193				p->cfg.attr[i / 32] |= 1 << (i % 32);
2194			}
2195		}
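		/* cfg.attr[0]/[1] form a 64 bit mask of the vertex attribute
		 * components the shader actually reads; only those were
		 * assigned consecutive hw input registers above.
		 */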
2196
2197		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
2198			p->cfg.io[i].hw = rid;
2199			p->cfg.io[i].id_vp = i;
2200
2201			for (c = 0; c < 4; ++c) {
2202				int n = i * 4 + c;
2203				if (!pc->result[n].acc)
2204					continue;
2205				pc->result[n].hw = rid++;
2206				p->cfg.io[i].mask |= 1 << c;
2207			}
2208		}
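		/* outputs are packed the same way: io[i].hw is the first hw
		 * result register of TGSI output i and io[i].mask records
		 * which of its components are actually written.
		 */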
2209
2210		for (c = 0; c < 2; ++c)
2211			if (p->cfg.two_side[c].hw < 0x40)
2212				p->cfg.two_side[c] = p->cfg.io[
2213					p->cfg.two_side[c].hw];
2214
2215		if (p->cfg.psiz < 0x40)
2216			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
2217	} else
2218	if (p->type == PIPE_SHADER_FRAGMENT) {
2219		int rid, aid;
2220		unsigned n = 0, m = pc->attr_nr - flat_nr;
2221
2222		int base = (TGSI_SEMANTIC_POSITION ==
2223			    p->info.input_semantic_name[0]) ? 0 : 1;
2224
2225		/* non-flat interpolants have to be mapped to
2226		 * the lower hardware IDs, so sort them:
2227		 */
2228		for (i = 0; i < pc->attr_nr; i++) {
2229			if (pc->interp_mode[i] == INTERP_FLAT) {
2230				p->cfg.io[m].id_vp = i + base;
2231				p->cfg.io[m++].id_fp = i;
2232			} else {
2233				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
2234					p->cfg.io[n].linear = TRUE;
2235				p->cfg.io[n].id_vp = i + base;
2236				p->cfg.io[n++].id_fp = i;
2237			}
2238		}
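		/* the non-flat inputs now occupy io[0] through
		 * io[attr_nr - flat_nr - 1] and the flat ones follow;
		 * id_fp is the original TGSI input index, id_vp the VP
		 * output expected to feed it.
		 */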
2239
2240		if (!base) /* set w-coordinate mask from perspective interp */
2241			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
2242
2243		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
2244			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
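		/* aid is the first interpolant index after the position
		 * components; the FP inputs get their interpolants (rhw)
		 * assigned from there on, in sorted io[] order.
		 */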
2245
2246		for (n = 0; n < pc->attr_nr; ++n) {
2247			p->cfg.io[n].hw = rid = aid;
2248			i = p->cfg.io[n].id_fp;
2249
2250			for (c = 0; c < 4; ++c) {
2251				if (!pc->attr[i * 4 + c].acc)
2252					continue;
2253				pc->attr[i * 4 + c].rhw = rid++;
2254				p->cfg.io[n].mask |= 1 << c;
2255
2256				load_interpolant(pc, &pc->attr[i * 4 + c]);
2257			}
2258			aid += popcnt4(p->cfg.io[n].mask);
2259		}
2260
2261		if (!base)
2262			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
2263
2264		m = popcnt4(p->cfg.regs[1] >> 24);
2265
2266		/* set count of non-position inputs and of non-flat
2267		 * non-position inputs for FP_INTERPOLANT_CTRL
2268		 */
2269		p->cfg.regs[1] |= aid - m;
2270
2271		if (flat_nr) {
2272			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
2273			p->cfg.regs[1] |= (i - m) << 16;
2274		} else
2275			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
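		/* cfg.regs[1] now holds the position perspective mask in
		 * bits 24+, the non-flat non-position input count in bits
		 * 16-23 and the total non-position input count in the low
		 * bits; bits 8-15 (where 'normal' inputs start in the VP
		 * result map) are filled in by nv50_linkage_validate.
		 */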
2276
2277		/* mark color semantic for light-twoside */
2278		n = 0x40;
2279		for (i = 0; i < pc->attr_nr; i++) {
2280			ubyte si, sn;
2281
2282			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
2283			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
2284
2285			if (sn == TGSI_SEMANTIC_COLOR) {
2286				p->cfg.two_side[si] = p->cfg.io[i];
2287
2288				/* increase colour count */
2289				p->cfg.regs[0] += popcnt4(
2290					p->cfg.two_side[si].mask) << 16;
2291
2292				n = MIN2(n, p->cfg.io[i].hw - m);
2293			}
2294		}
2295		if (n < 0x40)
2296			p->cfg.regs[0] += n;
2297
2298		/* Initialize FP results:
2299		 * FragDepth is always the first TGSI output and the last hw output
2300		 */
2301		i = p->info.writes_z ? 4 : 0;
2302		for (rid = 0; i < pc->result_nr * 4; i++)
2303			pc->result[i].rhw = rid++;
2304		if (p->info.writes_z)
2305			pc->result[2].rhw = rid;
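		/* result[2] is the z component of the FragDepth output
		 * (TGSI result 0 when writes_z), which goes into the last
		 * hw result register, after all colour components.
		 */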
2306
2307		p->cfg.high_result = rid;
2308	}
2309
2310	if (pc->immd_nr) {
2311		int rid = 0;
2312
2313		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
2314		if (!pc->immd)
2315			goto out_err;
2316
2317		for (i = 0; i < pc->immd_nr; i++) {
2318			for (c = 0; c < 4; c++, rid++)
2319				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
2320		}
2321	}
2322
2323	ret = TRUE;
2324out_err:
2325	if (pc->iv_p)
2326		free_temp(pc, pc->iv_p);
2327	if (pc->iv_c)
2328		free_temp(pc, pc->iv_c);
2329
2330	tgsi_parse_free(&tp);
2331	return ret;
2332}
2333
2334static void
2335free_nv50_pc(struct nv50_pc *pc)
2336{
2337	if (pc->immd)
2338		FREE(pc->immd);
2339	if (pc->param)
2340		FREE(pc->param);
2341	if (pc->result)
2342		FREE(pc->result);
2343	if (pc->attr)
2344		FREE(pc->attr);
2345	if (pc->temp)
2346		FREE(pc->temp);
2347
2348	FREE(pc);
2349}
2350
2351static boolean
2352ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
2353{
2354	int i, c;
2355	unsigned rtype[2] = { P_ATTR, P_RESULT };
2356
2357	pc->p = p;
2358	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
2359	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
2360	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
2361	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
2362
2363	p->cfg.high_temp = 4;
2364
2365	p->cfg.two_side[0].hw = 0x40;
2366	p->cfg.two_side[1].hw = 0x40;
2367
2368	switch (p->type) {
2369	case PIPE_SHADER_VERTEX:
2370		p->cfg.psiz = 0x40;
2371		p->cfg.clpd = 0x40;
2372		p->cfg.io_nr = pc->result_nr;
2373		break;
2374	case PIPE_SHADER_FRAGMENT:
2375		rtype[0] = rtype[1] = P_TEMP;
2376
2377		p->cfg.regs[0] = 0x01000004;
2378		p->cfg.io_nr = pc->attr_nr;
2379
2380		if (p->info.writes_z) {
2381			p->cfg.regs[2] |= 0x00000100;
2382			p->cfg.regs[3] |= 0x00000011;
2383		}
2384		if (p->info.uses_kill)
2385			p->cfg.regs[2] |= 0x00100000;
2386		break;
2387	}
2388
2389	if (pc->temp_nr) {
2390		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
2391		if (!pc->temp)
2392			return FALSE;
2393
2394		for (i = 0; i < pc->temp_nr * 4; ++i)
2395			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
2396	}
2397
2398	if (pc->attr_nr) {
2399		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
2400		if (!pc->attr)
2401			return FALSE;
2402
2403		for (i = 0; i < pc->attr_nr * 4; ++i)
2404			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
2405	}
2406
2407	if (pc->result_nr) {
2408		unsigned nr = pc->result_nr * 4;
2409
2410		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
2411		if (!pc->result)
2412			return FALSE;
2413
2414		for (i = 0; i < nr; ++i)
2415			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
2416	}
2417
2418	if (pc->param_nr) {
2419		int rid = 0;
2420
2421		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
2422		if (!pc->param)
2423			return FALSE;
2424
2425		for (i = 0; i < pc->param_nr; ++i)
2426			for (c = 0; c < 4; ++c, ++rid)
2427				ctor_reg(&pc->param[rid], P_CONST, i, rid);
2428	}
2429
2430	return TRUE;
2431}
2432
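/* FP results are produced in temporaries during translation; emit MOVs for
 * any that did not already end up in the fixed output register (rhw) the
 * hardware expects to read them from.
 */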
2433static void
2434nv50_fp_move_results(struct nv50_pc *pc)
2435{
2436	struct nv50_reg reg;
2437	unsigned i;
2438
2439	ctor_reg(&reg, P_TEMP, -1, -1);
2440
2441	for (i = 0; i < pc->result_nr * 4; ++i) {
2442		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
2443			continue;
2444		if (pc->result[i].rhw != pc->result[i].hw) {
2445			reg.hw = pc->result[i].rhw;
2446			emit_mov(pc, &reg, &pc->result[i]);
2447		}
2448	}
2449}
2450
2451static void
2452nv50_program_fixup_insns(struct nv50_pc *pc)
2453{
2454	struct nv50_program_exec *e, *prev = NULL, **bra_list;
2455	unsigned i, n, pos;
2456
2457	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
2458
2459	/* Collect branch instructions; we need to adjust their offsets
2460	 * when converting 32 bit instructions to 64 bit ones.
2461	 */
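	/* (they are recognised by carrying a param.index but no param.mask) */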
2462	for (n = 0, e = pc->p->exec_head; e; e = e->next)
2463		if (e->param.index >= 0 && !e->param.mask)
2464			bra_list[n++] = e;
2465
2466	/* Make sure we don't have any unpaired 32 bit instructions. */
2467	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
2468		pos += is_long(e) ? 2 : 1;
2469
2470		if ((pos & 1) && (!e->next || is_long(e->next))) {
2471			for (i = 0; i < n; ++i)
2472				if (bra_list[i]->param.index >= pos)
2473					bra_list[i]->param.index += 1;
2474			convert_to_long(pc, e);
2475			++pos;
2476		}
2477		if (e->next)
2478			prev = e;
2479	}
2480
2481	assert(!is_immd(pc->p->exec_head));
2482	assert(!is_immd(pc->p->exec_tail));
2483
2484	/* last instruction must be long so it can have the end bit set */
2485	if (!is_long(pc->p->exec_tail)) {
2486		convert_to_long(pc, pc->p->exec_tail);
2487		if (prev)
2488			convert_to_long(pc, prev);
2489	}
2490	assert(!(pc->p->exec_tail->inst[1] & 2));
2491	/* set the end-bit */
2492	pc->p->exec_tail->inst[1] |= 1;
2493
2494	FREE(bra_list);
2495}
2496
2497static boolean
2498nv50_program_tx(struct nv50_program *p)
2499{
2500	struct tgsi_parse_context parse;
2501	struct nv50_pc *pc;
2502	boolean ret;
2503
2504	pc = CALLOC_STRUCT(nv50_pc);
2505	if (!pc)
2506		return FALSE;
2507
2508	ret = ctor_nv50_pc(pc, p);
2509	if (ret == FALSE)
2510		goto out_cleanup;
2511
2512	ret = nv50_program_tx_prep(pc);
2513	if (ret == FALSE)
2514		goto out_cleanup;
2515
2516	tgsi_parse_init(&parse, pc->p->pipe.tokens);
2517	while (!tgsi_parse_end_of_tokens(&parse)) {
2518		const union tgsi_full_token *tok = &parse.FullToken;
2519
2520		/* don't allow half insn/immd on first and last instruction */
2521		pc->allow32 = TRUE;
2522		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2523			pc->allow32 = FALSE;
2524
2525		tgsi_parse_token(&parse);
2526
2527		switch (tok->Token.Type) {
2528		case TGSI_TOKEN_TYPE_INSTRUCTION:
2529			++pc->insn_cur;
2530			ret = nv50_tgsi_insn(pc, tok);
2531			if (ret == FALSE)
2532				goto out_err;
2533			break;
2534		default:
2535			break;
2536		}
2537	}
2538
2539	if (pc->p->type == PIPE_SHADER_FRAGMENT)
2540		nv50_fp_move_results(pc);
2541
2542	nv50_program_fixup_insns(pc);
2543
2544	p->param_nr = pc->param_nr * 4;
2545	p->immd_nr = pc->immd_nr * 4;
2546	p->immd = pc->immd_buf;
2547
2548out_err:
2549	tgsi_parse_free(&parse);
2550
2551out_cleanup:
2552	free_nv50_pc(pc);
2553	return ret;
2554}
2555
2556static void
2557nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2558{
2559	if (nv50_program_tx(p) == FALSE)
2560		assert(0);
2561	p->translated = TRUE;
2562}
2563
2564static void
2565nv50_program_upload_data(struct nv50_context *nv50, float *map,
2566			unsigned start, unsigned count, unsigned cbuf)
2567{
2568	struct nouveau_channel *chan = nv50->screen->base.channel;
2569	struct nouveau_grobj *tesla = nv50->screen->tesla;
2570
2571	while (count) {
2572		unsigned nr = count > 2047 ? 2047 : count;
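		/* CB_DATA bursts appear to be limited to 2047 dwords, so
		 * upload the data in chunks of at most that size.
		 */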
2573
2574		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2575		OUT_RING  (chan, (cbuf << 0) | (start << 8));
2576		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2577		OUT_RINGp (chan, map, nr);
2578
2579		map += nr;
2580		start += nr;
2581		count -= nr;
2582	}
2583}
2584
2585static void
2586nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2587{
2588	struct pipe_screen *pscreen = nv50->pipe.screen;
2589
2590	if (!p->data[0] && p->immd_nr) {
2591		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2592
2593		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
2594			while (heap->next && heap->size < p->immd_nr) {
2595				struct nv50_program *evict = heap->next->priv;
2596				nouveau_resource_free(&evict->data[0]);
2597			}
2598
2599			if (nouveau_resource_alloc(heap, p->immd_nr, p,
2600						   &p->data[0]))
2601				assert(0);
2602		}
2603
2604		/* immediates only need to be uploaded again after their allocation was freed */
2605		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2606					 p->immd_nr, NV50_CB_PMISC);
2607	}
2608
2609	assert(p->param_nr <= 128);
2610
2611	if (p->param_nr) {
2612		unsigned cb;
2613		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
2614					     PIPE_BUFFER_USAGE_CPU_READ);
2615
2616		if (p->type == PIPE_SHADER_VERTEX)
2617			cb = NV50_CB_PVP;
2618		else
2619			cb = NV50_CB_PFP;
2620
2621		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
2622		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
2623	}
2624}
2625
2626static void
2627nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2628{
2629	struct nouveau_channel *chan = nv50->screen->base.channel;
2630	struct nouveau_grobj *tesla = nv50->screen->tesla;
2631	struct nv50_program_exec *e;
2632	struct nouveau_stateobj *so;
2633	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2634	unsigned start, count, *up, *ptr;
2635	boolean upload = FALSE;
2636
2637	if (!p->bo) {
2638		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
2639			       p->exec_size * 4, &p->bo);
2640		upload = TRUE;
2641	}
2642
2643	if (p->data[0] && p->data[0]->start != p->data_start[0])
2644		upload = TRUE;
2645
2646	if (!upload)
2647		return;
2648
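	/* Patch in values only known at upload time: branch instructions
	 * (param.mask == 0) get their final code offset, and references
	 * into data[0] (the immediates) are rebased to wherever that heap
	 * allocation currently starts.
	 */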
2649	for (e = p->exec_head; e; e = e->next) {
2650		unsigned ei, ci, bs;
2651
2652		if (e->param.index < 0)
2653			continue;
2654
2655		if (e->param.mask == 0) {
2656			assert(!(e->param.index & 1));
2657			/* branch targets seem to be in 8 byte steps */
2658			ei = (e->param.index >> 1) + 0 /* START_ID */;
2659
2660			e->inst[0] &= 0xf0000fff;
2661			e->inst[0] |= ei << 12;
2662			continue;
2663		}
2664
2665		bs = (e->inst[1] >> 22) & 0x07;
2666		assert(bs < 2);
2667		ei = e->param.shift >> 5;
2668		ci = e->param.index;
2669		if (bs == 0)
2670			ci += p->data[bs]->start;
2671
2672		e->inst[ei] &= ~e->param.mask;
2673		e->inst[ei] |= (ci << e->param.shift);
2674	}
2675
2676	if (p->data[0])
2677		p->data_start[0] = p->data[0]->start;
2678
2679#ifdef NV50_PROGRAM_DUMP
2680	NOUVEAU_ERR("-------\n");
2681	for (e = p->exec_head; e; e = e->next) {
2682		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2683		if (is_long(e))
2684			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2685	}
2686#endif
2687
2688	up = ptr = MALLOC(p->exec_size * 4);
2689	for (e = p->exec_head; e; e = e->next) {
2690		*(ptr++) = e->inst[0];
2691		if (is_long(e))
2692			*(ptr++) = e->inst[1];
2693	}
2694
2695	so = so_new(4,2);
2696	so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
2697	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2698	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2699	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2700
2701	start = 0; count = p->exec_size;
2702	while (count) {
2703		struct nouveau_channel *chan = nv50->screen->base.channel;
2704		unsigned nr;
2705
2706		so_emit(chan, so);
2707
2708		nr = MIN2(count, 2047);
2709		nr = MIN2(chan->pushbuf->remaining, nr);
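		/* keep 3 dwords of headroom for the two method headers and
		 * the CB_ADDR value emitted below; flush if they don't fit.
		 */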
2710		if (chan->pushbuf->remaining < (nr + 3)) {
2711			FIRE_RING(chan);
2712			continue;
2713		}
2714
2715		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2716		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
2717		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2718		OUT_RINGp (chan, up + start, nr);
2719
2720		start += nr;
2721		count -= nr;
2722	}
2723
2724	FREE(up);
2725	so_ref(NULL, &so);
2726}
2727
2728void
2729nv50_vertprog_validate(struct nv50_context *nv50)
2730{
2731	struct nouveau_grobj *tesla = nv50->screen->tesla;
2732	struct nv50_program *p = nv50->vertprog;
2733	struct nouveau_stateobj *so;
2734
2735	if (!p->translated) {
2736		nv50_program_validate(nv50, p);
2737		if (!p->translated)
2738			assert(0);
2739	}
2740
2741	nv50_program_validate_data(nv50, p);
2742	nv50_program_validate_code(nv50, p);
2743
2744	so = so_new(13, 2);
2745	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2746	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2747		      NOUVEAU_BO_HIGH, 0, 0);
2748	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2749		      NOUVEAU_BO_LOW, 0, 0);
2750	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
2751	so_data  (so, p->cfg.attr[0]);
2752	so_data  (so, p->cfg.attr[1]);
2753	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
2754	so_data  (so, p->cfg.high_result);
2755	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
2756	so_data  (so, p->cfg.high_result); //8);
2757	so_data  (so, p->cfg.high_temp);
2758	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
2759	so_data  (so, 0); /* program start offset */
2760	so_ref(so, &nv50->state.vertprog);
2761	so_ref(NULL, &so);
2762}
2763
2764void
2765nv50_fragprog_validate(struct nv50_context *nv50)
2766{
2767	struct nouveau_grobj *tesla = nv50->screen->tesla;
2768	struct nv50_program *p = nv50->fragprog;
2769	struct nouveau_stateobj *so;
2770
2771	if (!p->translated) {
2772		nv50_program_validate(nv50, p);
2773		if (!p->translated)
2774			assert(0);
2775	}
2776
2777	nv50_program_validate_data(nv50, p);
2778	nv50_program_validate_code(nv50, p);
2779
2780	so = so_new(64, 2);
2781	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2782	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2783		      NOUVEAU_BO_HIGH, 0, 0);
2784	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2785		      NOUVEAU_BO_LOW, 0, 0);
2786	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
2787	so_data  (so, p->cfg.high_temp);
2788	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
2789	so_data  (so, p->cfg.high_result);
2790	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
2791	so_data  (so, p->cfg.regs[2]);
2792	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
2793	so_data  (so, p->cfg.regs[3]);
2794	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
2795	so_data  (so, 0); /* program start offset */
2796	so_ref(so, &nv50->state.fragprog);
2797	so_ref(NULL, &so);
2798}
2799
2800static void
2801nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
2802{
2803	struct nv50_program *fp = nv50->fragprog;
2804	struct nv50_program *vp = nv50->vertprog;
2805	unsigned i, c, m = base;
2806
2807	/* XXX: This can't work correctly in all cases yet; we either
2808	 * have to create TGSI_SEMANTIC_PNTC or make sprite_coord_mode
2809	 * per FP input instead of per VP output.
2810	 */
2811	memset(pntc, 0, 8 * sizeof(uint32_t));
2812
2813	for (i = 0; i < fp->cfg.io_nr; i++) {
2814		uint8_t sn, si;
2815		uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
2816		unsigned n = popcnt4(fp->cfg.io[i].mask);
2817
2818		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
2819			m += n;
2820			continue;
2821		}
2822
2823		sn = vp->info.input_semantic_name[j];
2824		si = vp->info.input_semantic_index[j];
2825
2826		if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) {
2827			ubyte mode =
2828				nv50->rasterizer->pipe.sprite_coord_mode[si];
2829
2830			if (mode == PIPE_SPRITE_COORD_NONE) {
2831				m += n;
2832				continue;
2833			}
2834		}
2835
2836		/* this is either PointCoord or replaced by sprite coords */
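		/* each interpolant gets a 4 bit field in pntc[]; a non-zero
		 * value (component index + 1) presumably selects the sprite
		 * coordinate component the hw substitutes for it.
		 */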
2837		for (c = 0; c < 4; c++) {
2838			if (!(fp->cfg.io[i].mask & (1 << c)))
2839				continue;
2840			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
2841			++m;
2842		}
2843	}
2844}
2845
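/* Build the VP result map entries for one FP input: each written component
 * is routed to the matching VP output register when the VP provides it,
 * otherwise to a constant (0x40 seems to read as 0.0, 0x41 as 1.0 for the
 * w component). Linear (non-perspective) inputs are flagged in lin[].
 * Returns the next free map index.
 */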
2846static int
2847nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
2848	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
2849{
2850	int c;
2851	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
2852	uint8_t *map = (uint8_t *)p_map;
2853
2854	for (c = 0; c < 4; ++c) {
2855		if (mf & 1) {
2856			if (fpi->linear == TRUE)
2857				lin[mid / 32] |= 1 << (mid % 32);
2858			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
2859		}
2860
2861		oid += mv & 1;
2862		mf >>= 1;
2863		mv >>= 1;
2864	}
2865
2866	return mid;
2867}
2868
2869void
2870nv50_linkage_validate(struct nv50_context *nv50)
2871{
2872	struct nouveau_grobj *tesla = nv50->screen->tesla;
2873	struct nv50_program *vp = nv50->vertprog;
2874	struct nv50_program *fp = nv50->fragprog;
2875	struct nouveau_stateobj *so;
2876	struct nv50_sreg4 dummy, *vpo;
2877	int i, n, c, m = 0;
2878	uint32_t map[16], lin[4], reg[5], pcrd[8];
2879
2880	memset(map, 0, sizeof(map));
2881	memset(lin, 0, sizeof(lin));
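	/* map[] is the VP result map, one byte per FP input component naming
	 * the VP output register that feeds it; lin[] marks the interpolants
	 * that are linear rather than perspective corrected.
	 */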
2882
2883	reg[1] = 0x00000004; /* low and high clip distance map ids */
2884	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
2885	reg[3] = 0x00000000; /* point size map id & enable */
2886	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
2887	reg[4] = fp->cfg.regs[1]; /* interpolant info */
2888
2889	dummy.linear = FALSE;
2890	dummy.mask = 0xf; /* map all components of HPOS */
2891	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
2892
2893	dummy.mask = 0x0;
2894
2895	if (vp->cfg.clpd < 0x40) {
2896		for (c = 0; c < vp->cfg.clpd_nr; ++c)
2897			map[m++] = vp->cfg.clpd + c;
2898		reg[1] = (m << 8);
2899	}
2900
2901	reg[0] |= m << 8; /* adjust BFC0 id */
2902
2903	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
2904	if (nv50->rasterizer->pipe.light_twoside) {
2905		vpo = &vp->cfg.two_side[0];
2906
2907		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
2908		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
2909	}
2910
2911	reg[0] += m - 4; /* adjust FFC0 id */
2912	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
2913
2914	i = 0;
2915	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
2916		i = 1;
2917	for (; i < fp->cfg.io_nr; i++) {
2918		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
2919		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
2920
2921		n = fp->cfg.io[i].id_vp;
2922		if (n >= vp->cfg.io_nr ||
2923		    vp->info.output_semantic_name[n] != sn ||
2924		    vp->info.output_semantic_index[n] != si)
2925			vpo = &dummy;
2926		else
2927			vpo = &vp->cfg.io[n];
2928
2929		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
2930	}
2931
2932	if (nv50->rasterizer->pipe.point_size_per_vertex) {
2933		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
2934		reg[3] = (m++ << 4) | 1;
2935	}
2936
2937	/* now fill the stateobj */
2938	so = so_new(64, 0);
2939
2940	n = (m + 3) / 4;
2941	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
2942	so_data  (so, m);
2943	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
2944	so_datap (so, map, n);
2945
2946	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
2947	so_datap (so, reg, 4);
2948
2949	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
2950	so_data  (so, reg[4]);
2951
2952	so_method(so, tesla, 0x1540, 4);
2953	so_datap (so, lin, 4);
2954
2955	if (nv50->rasterizer->pipe.point_sprite) {
2956		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
2957
2958		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
2959		so_datap (so, pcrd, 8);
2960	}
2961
2962	so_ref(so, &nv50->state.programs);
2963	so_ref(NULL, &so);
2964}
2965
2966void
2967nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2968{
2969	while (p->exec_head) {
2970		struct nv50_program_exec *e = p->exec_head;
2971
2972		p->exec_head = e->next;
2973		FREE(e);
2974	}
2975	p->exec_tail = NULL;
2976	p->exec_size = 0;
2977
2978	nouveau_bo_ref(NULL, &p->bo);
2979
2980	nouveau_resource_free(&p->data[0]);
2981
2982	p->translated = 0;
2983}
2984