nv50_program.c revision 3accd7ebe971624bed5624f73ed3522c9de4c193
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
/* Number of entries in pc->r_temp[], i.e. the upper bound on hardware
 * temporaries the allocators in this file will hand out. */
#define NV50_SU_MAX_TEMP 64
//#define NV50_PROGRAM_DUMP
36
37/* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * 	- Fuck it off, introduce a way to negate args for ops that
41 * 	  support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
53 * FUCK! watch dst==src vectors, can overwrite components that are needed.
54 * 	ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * 	FP attr/result assignment - how?
58 * 		attrib
59 * 			- 0x16bc maps vp output onto fp hpos
60 * 			- 0x16c0 maps vp output onto fp col0
61 * 		result
62 * 			- colr always 0-3
63 * 			- depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * 	      "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * 	- 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * 	- XX == FP high something
75 */
/* One scalar register as handled by the code generator: either a single
 * component of a TGSI register or a scratch/immediate value created by the
 * helpers in this file.
 */
struct nv50_reg {
	/* register file this value belongs to */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;
	/* -1 for registers created by alloc_temp(), alloc_temp4() and
	 * alloc_immd().
	 * NOTE(review): presumably the TGSI register slot otherwise; the
	 * code that fills it in is not part of this excerpt - confirm. */
	int index;

	/* For P_TEMP: hardware temporary number, -1 while none is assigned.
	 * For P_IMMD: offset of the value inside pc->immd_buf. */
	int hw;
	int neg;

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
92
/* Translation state for one program; passed to every alloc_*, set_* and
 * emit_* helper in this file.
 */
struct nv50_pc {
	/* program being built; emit() appends instructions to its exec list */
	struct nv50_program *p;

	/* hw resources */
	/* r_temp[i] is the nv50_reg currently occupying hardware temporary i,
	 * or NULL while that temporary is free */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];

	/* tgsi resources */
	/* the arrays below are indexed as (register index * 4 + component),
	 * see tgsi_dst() / tgsi_src() */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	/* immediate values, four floats per entry; immd_nr counts entries */
	float *immd_buf;
	int immd_nr;

	/* scratch temporaries handed out by temp_temp() and released all at
	 * once by kill_temp_temp() */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;

	unsigned interp_mode[32];
	/* perspective interpolation registers */
	struct nv50_reg *iv_p;
	struct nv50_reg *iv_c;

	/* current instruction and total number of insns */
	unsigned insn_cur;
	unsigned insn_nr;

	/* when FALSE, emit_mul()/emit_add() force the long encoding and
	 * emit_mov() does not inline immediates */
	boolean allow32;
};
126
127static void
128alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
129{
130	int i = 0;
131
132	if (reg->type == P_RESULT) {
133		if (pc->p->cfg.high_result < (reg->hw + 1))
134			pc->p->cfg.high_result = reg->hw + 1;
135	}
136
137	if (reg->type != P_TEMP)
138		return;
139
140	if (reg->hw >= 0) {
141		/*XXX: do this here too to catch FP temp-as-attr usage..
142		 *     not clean, but works */
143		if (pc->p->cfg.high_temp < (reg->hw + 1))
144			pc->p->cfg.high_temp = reg->hw + 1;
145		return;
146	}
147
148	if (reg->rhw != -1) {
149		/* try to allocate temporary with index rhw first */
150		if (!(pc->r_temp[reg->rhw])) {
151			pc->r_temp[reg->rhw] = reg;
152			reg->hw = reg->rhw;
153			if (pc->p->cfg.high_temp < (reg->rhw + 1))
154				pc->p->cfg.high_temp = reg->rhw + 1;
155			return;
156		}
157		/* make sure we don't get things like $r0 needs to go
158		 * in $r1 and $r1 in $r0
159		 */
160		i = pc->result_nr * 4;
161	}
162
163	for (; i < NV50_SU_MAX_TEMP; i++) {
164		if (!(pc->r_temp[i])) {
165			pc->r_temp[i] = reg;
166			reg->hw = i;
167			if (pc->p->cfg.high_temp < (i + 1))
168				pc->p->cfg.high_temp = i + 1;
169			return;
170		}
171	}
172
173	assert(0);
174}
175
176static struct nv50_reg *
177alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
178{
179	struct nv50_reg *r;
180	int i;
181
182	if (dst && dst->type == P_TEMP && dst->hw == -1)
183		return dst;
184
185	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
186		if (!pc->r_temp[i]) {
187			r = CALLOC_STRUCT(nv50_reg);
188			r->type = P_TEMP;
189			r->index = -1;
190			r->hw = i;
191			r->rhw = -1;
192			pc->r_temp[i] = r;
193			return r;
194		}
195	}
196
197	assert(0);
198	return NULL;
199}
200
201/* Assign the hw of the discarded temporary register src
202 * to the tgsi register dst and free src.
203 */
204static void
205assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
206{
207	assert(src->index == -1 && src->hw != -1);
208
209	if (dst->hw != -1)
210		pc->r_temp[dst->hw] = NULL;
211	pc->r_temp[src->hw] = dst;
212	dst->hw = src->hw;
213
214	FREE(src);
215}
216
217/* release the hardware resource held by r */
218static void
219release_hw(struct nv50_pc *pc, struct nv50_reg *r)
220{
221	assert(r->type == P_TEMP);
222	if (r->hw == -1)
223		return;
224
225	assert(pc->r_temp[r->hw] == r);
226	pc->r_temp[r->hw] = NULL;
227
228	r->acc = 0;
229	if (r->index == -1)
230		FREE(r);
231}
232
233static void
234free_temp(struct nv50_pc *pc, struct nv50_reg *r)
235{
236	if (r->index == -1) {
237		unsigned hw = r->hw;
238
239		FREE(pc->r_temp[hw]);
240		pc->r_temp[hw] = NULL;
241	}
242}
243
244static int
245alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
246{
247	int i;
248
249	if ((idx + 4) >= NV50_SU_MAX_TEMP)
250		return 1;
251
252	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
253	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
254		return alloc_temp4(pc, dst, idx + 1);
255
256	for (i = 0; i < 4; i++) {
257		dst[i] = CALLOC_STRUCT(nv50_reg);
258		dst[i]->type = P_TEMP;
259		dst[i]->index = -1;
260		dst[i]->hw = idx + i;
261		pc->r_temp[idx + i] = dst[i];
262	}
263
264	return 0;
265}
266
/* Release the four registers in reg[0..3] via free_temp(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	struct nv50_reg **it = reg;
	struct nv50_reg **const end = reg + 4;

	while (it != end)
		free_temp(pc, *it++);
}
275
276static struct nv50_reg *
277temp_temp(struct nv50_pc *pc)
278{
279	if (pc->temp_temp_nr >= 16)
280		assert(0);
281
282	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
283	return pc->temp_temp[pc->temp_temp_nr++];
284}
285
286static void
287kill_temp_temp(struct nv50_pc *pc)
288{
289	int i;
290
291	for (i = 0; i < pc->temp_temp_nr; i++)
292		free_temp(pc, pc->temp_temp[i]);
293	pc->temp_temp_nr = 0;
294}
295
296static int
297ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
298{
299	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)),
300			       (pc->immd_nr + 1) * 4 * sizeof(float));
301	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
302	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
303	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
304	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
305
306	return pc->immd_nr++;
307}
308
309static struct nv50_reg *
310alloc_immd(struct nv50_pc *pc, float f)
311{
312	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
313	unsigned hw;
314
315	for (hw = 0; hw < pc->immd_nr * 4; hw++)
316		if (pc->immd_buf[hw] == f)
317			break;
318
319	if (hw == pc->immd_nr * 4)
320		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
321
322	r->type = P_IMMD;
323	r->hw = hw;
324	r->index = -1;
325	return r;
326}
327
328static struct nv50_program_exec *
329exec(struct nv50_pc *pc)
330{
331	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
332
333	e->param.index = -1;
334	return e;
335}
336
337static void
338emit(struct nv50_pc *pc, struct nv50_program_exec *e)
339{
340	struct nv50_program *p = pc->p;
341
342	if (p->exec_tail)
343		p->exec_tail->next = e;
344	if (!p->exec_head)
345		p->exec_head = e;
346	p->exec_tail = e;
347	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
348}
349
/* Forward declaration: set_pred() and set_pred_wr() below call set_long()
 * before its definition. */
static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
351
352static boolean
353is_long(struct nv50_program_exec *e)
354{
355	if (e->inst[0] & 1)
356		return TRUE;
357	return FALSE;
358}
359
360static boolean
361is_immd(struct nv50_program_exec *e)
362{
363	if (is_long(e) && (e->inst[1] & 3) == 3)
364		return TRUE;
365	return FALSE;
366}
367
368static INLINE void
369set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
370	 struct nv50_program_exec *e)
371{
372	set_long(pc, e);
373	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
374	e->inst[1] |= (pred << 7) | (idx << 12);
375}
376
377static INLINE void
378set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
379	    struct nv50_program_exec *e)
380{
381	set_long(pc, e);
382	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
383	e->inst[1] |= (idx << 4) | (on << 6);
384}
385
386static INLINE void
387set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
388{
389	if (is_long(e))
390		return;
391
392	e->inst[0] |= 1;
393	set_pred(pc, 0xf, 0, e);
394	set_pred_wr(pc, 0, 0, e);
395}
396
397static INLINE void
398set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
399{
400	if (dst->type == P_RESULT) {
401		set_long(pc, e);
402		e->inst[1] |= 0x00000008;
403	}
404
405	alloc_reg(pc, dst);
406	e->inst[0] |= (dst->hw << 2);
407}
408
409static INLINE void
410set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
411{
412	unsigned val = fui(pc->immd_buf[imm->hw]);
413
414	set_long(pc, e);
415	/*XXX: can't be predicated - bits overlap.. catch cases where both
416	 *     are required and avoid them. */
417	set_pred(pc, 0, 0, e);
418	set_pred_wr(pc, 0, 0, e);
419
420	e->inst[1] |= 0x00000002 | 0x00000001;
421	e->inst[0] |= (val & 0x3f) << 16;
422	e->inst[1] |= (val >> 6) << 2;
423}
424
425
/* Flag bits for the mode argument of emit_interp(). When INTERP_FLAT is
 * set, emit_interp() ignores INTERP_PERSPECTIVE and INTERP_CENTROID. */
#define INTERP_LINEAR		0
#define INTERP_FLAT			1
#define INTERP_PERSPECTIVE	2
#define INTERP_CENTROID		4
430
431/* interpolant index has been stored in dst->rhw */
432static void
433emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
434		unsigned mode)
435{
436	assert(dst->rhw != -1);
437	struct nv50_program_exec *e = exec(pc);
438
439	e->inst[0] |= 0x80000000;
440	set_dst(pc, dst, e);
441	e->inst[0] |= (dst->rhw << 16);
442
443	if (mode & INTERP_FLAT) {
444		e->inst[0] |= (1 << 8);
445	} else {
446		if (mode & INTERP_PERSPECTIVE) {
447			e->inst[0] |= (1 << 25);
448			alloc_reg(pc, iv);
449			e->inst[0] |= (iv->hw << 9);
450		}
451
452		if (mode & INTERP_CENTROID)
453			e->inst[0] |= (1 << 24);
454	}
455
456	emit(pc, e);
457}
458
459static void
460set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
461	 struct nv50_program_exec *e)
462{
463	set_long(pc, e);
464
465	e->param.index = src->hw;
466	e->param.shift = s;
467	e->param.mask = m << (s % 32);
468
469	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
470}
471
472static void
473emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
474{
475	struct nv50_program_exec *e = exec(pc);
476
477	e->inst[0] |= 0x10000000;
478
479	set_dst(pc, dst, e);
480
481	if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
482		set_immd(pc, src, e);
483		/*XXX: 32-bit, but steals part of "half" reg space - need to
484		 *     catch and handle this case if/when we do half-regs
485		 */
486	} else
487	if (src->type == P_IMMD || src->type == P_CONST) {
488		set_long(pc, e);
489		set_data(pc, src, 0x7f, 9, e);
490		e->inst[1] |= 0x20000000; /* src0 const? */
491	} else {
492		if (src->type == P_ATTR) {
493			set_long(pc, e);
494			e->inst[1] |= 0x00200000;
495		}
496
497		alloc_reg(pc, src);
498		e->inst[0] |= (src->hw << 9);
499	}
500
501	if (is_long(e) && !is_immd(e)) {
502		e->inst[1] |= 0x04000000; /* 32-bit */
503		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
504		if (!(e->inst[1] & 0x20000000))
505			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
506	} else
507		e->inst[0] |= 0x00008000;
508
509	emit(pc, e);
510}
511
512static INLINE void
513emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
514{
515	struct nv50_reg *imm = alloc_immd(pc, f);
516	emit_mov(pc, dst, imm);
517	FREE(imm);
518}
519
520static boolean
521check_swap_src_0_1(struct nv50_pc *pc,
522		   struct nv50_reg **s0, struct nv50_reg **s1)
523{
524	struct nv50_reg *src0 = *s0, *src1 = *s1;
525
526	if (src0->type == P_CONST) {
527		if (src1->type != P_CONST) {
528			*s0 = src1;
529			*s1 = src0;
530			return TRUE;
531		}
532	} else
533	if (src1->type == P_ATTR) {
534		if (src0->type != P_ATTR) {
535			*s0 = src1;
536			*s1 = src0;
537			return TRUE;
538		}
539	}
540
541	return FALSE;
542}
543
544static void
545set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
546{
547	if (src->type == P_ATTR) {
548		set_long(pc, e);
549		e->inst[1] |= 0x00200000;
550	} else
551	if (src->type == P_CONST || src->type == P_IMMD) {
552		struct nv50_reg *temp = temp_temp(pc);
553
554		emit_mov(pc, temp, src);
555		src = temp;
556	}
557
558	alloc_reg(pc, src);
559	e->inst[0] |= (src->hw << 9);
560}
561
562static void
563set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
564{
565	if (src->type == P_ATTR) {
566		struct nv50_reg *temp = temp_temp(pc);
567
568		emit_mov(pc, temp, src);
569		src = temp;
570	} else
571	if (src->type == P_CONST || src->type == P_IMMD) {
572		assert(!(e->inst[0] & 0x00800000));
573		if (e->inst[0] & 0x01000000) {
574			struct nv50_reg *temp = temp_temp(pc);
575
576			emit_mov(pc, temp, src);
577			src = temp;
578		} else {
579			set_data(pc, src, 0x7f, 16, e);
580			e->inst[0] |= 0x00800000;
581		}
582	}
583
584	alloc_reg(pc, src);
585	e->inst[0] |= (src->hw << 16);
586}
587
588static void
589set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
590{
591	set_long(pc, e);
592
593	if (src->type == P_ATTR) {
594		struct nv50_reg *temp = temp_temp(pc);
595
596		emit_mov(pc, temp, src);
597		src = temp;
598	} else
599	if (src->type == P_CONST || src->type == P_IMMD) {
600		assert(!(e->inst[0] & 0x01000000));
601		if (e->inst[0] & 0x00800000) {
602			struct nv50_reg *temp = temp_temp(pc);
603
604			emit_mov(pc, temp, src);
605			src = temp;
606		} else {
607			set_data(pc, src, 0x7f, 32+14, e);
608			e->inst[0] |= 0x01000000;
609		}
610	}
611
612	alloc_reg(pc, src);
613	e->inst[1] |= (src->hw << 14);
614}
615
616static void
617emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
618	 struct nv50_reg *src1)
619{
620	struct nv50_program_exec *e = exec(pc);
621
622	e->inst[0] |= 0xc0000000;
623
624	if (!pc->allow32)
625		set_long(pc, e);
626
627	check_swap_src_0_1(pc, &src0, &src1);
628	set_dst(pc, dst, e);
629	set_src_0(pc, src0, e);
630	if (src1->type == P_IMMD && !is_long(e))
631		set_immd(pc, src1, e);
632	else
633		set_src_1(pc, src1, e);
634
635	emit(pc, e);
636}
637
638static void
639emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
640	 struct nv50_reg *src0, struct nv50_reg *src1)
641{
642	struct nv50_program_exec *e = exec(pc);
643
644	e->inst[0] |= 0xb0000000;
645
646	if (!pc->allow32)
647		set_long(pc, e);
648
649	check_swap_src_0_1(pc, &src0, &src1);
650	set_dst(pc, dst, e);
651	set_src_0(pc, src0, e);
652	if (is_long(e) || src1->type == P_CONST || src1->type == P_ATTR)
653		set_src_2(pc, src1, e);
654	else
655	if (src1->type == P_IMMD)
656		set_immd(pc, src1, e);
657	else
658		set_src_1(pc, src1, e);
659
660	emit(pc, e);
661}
662
663static void
664emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
665	    struct nv50_reg *src0, struct nv50_reg *src1)
666{
667	struct nv50_program_exec *e = exec(pc);
668
669	set_long(pc, e);
670	e->inst[0] |= 0xb0000000;
671	e->inst[1] |= (sub << 29);
672
673	check_swap_src_0_1(pc, &src0, &src1);
674	set_dst(pc, dst, e);
675	set_src_0(pc, src0, e);
676	set_src_1(pc, src1, e);
677
678	emit(pc, e);
679}
680
681static void
682emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
683	 struct nv50_reg *src1)
684{
685	struct nv50_program_exec *e = exec(pc);
686
687	e->inst[0] |= 0xb0000000;
688
689	set_long(pc, e);
690	if (check_swap_src_0_1(pc, &src0, &src1))
691		e->inst[1] |= 0x04000000;
692	else
693		e->inst[1] |= 0x08000000;
694
695	set_dst(pc, dst, e);
696	set_src_0(pc, src0, e);
697	set_src_2(pc, src1, e);
698
699	emit(pc, e);
700}
701
702static void
703emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
704	 struct nv50_reg *src1, struct nv50_reg *src2)
705{
706	struct nv50_program_exec *e = exec(pc);
707
708	e->inst[0] |= 0xe0000000;
709
710	check_swap_src_0_1(pc, &src0, &src1);
711	set_dst(pc, dst, e);
712	set_src_0(pc, src0, e);
713	set_src_1(pc, src1, e);
714	set_src_2(pc, src2, e);
715
716	emit(pc, e);
717}
718
719static void
720emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
721	 struct nv50_reg *src1, struct nv50_reg *src2)
722{
723	struct nv50_program_exec *e = exec(pc);
724
725	e->inst[0] |= 0xe0000000;
726	set_long(pc, e);
727	e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
728
729	check_swap_src_0_1(pc, &src0, &src1);
730	set_dst(pc, dst, e);
731	set_src_0(pc, src0, e);
732	set_src_1(pc, src1, e);
733	set_src_2(pc, src2, e);
734
735	emit(pc, e);
736}
737
738static void
739emit_flop(struct nv50_pc *pc, unsigned sub,
740	  struct nv50_reg *dst, struct nv50_reg *src)
741{
742	struct nv50_program_exec *e = exec(pc);
743
744	e->inst[0] |= 0x90000000;
745	if (sub) {
746		set_long(pc, e);
747		e->inst[1] |= (sub << 29);
748	}
749
750	set_dst(pc, dst, e);
751	set_src_0(pc, src, e);
752
753	emit(pc, e);
754}
755
756static void
757emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
758{
759	struct nv50_program_exec *e = exec(pc);
760
761	e->inst[0] |= 0xb0000000;
762
763	set_dst(pc, dst, e);
764	set_src_0(pc, src, e);
765	set_long(pc, e);
766	e->inst[1] |= (6 << 29) | 0x00004000;
767
768	emit(pc, e);
769}
770
771static void
772emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
773{
774	struct nv50_program_exec *e = exec(pc);
775
776	e->inst[0] |= 0xb0000000;
777
778	set_dst(pc, dst, e);
779	set_src_0(pc, src, e);
780	set_long(pc, e);
781	e->inst[1] |= (6 << 29);
782
783	emit(pc, e);
784}
785
786static void
787emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
788	 struct nv50_reg *src0, struct nv50_reg *src1)
789{
790	struct nv50_program_exec *e = exec(pc);
791	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
792	struct nv50_reg *rdst;
793
794	assert(c_op <= 7);
795	if (check_swap_src_0_1(pc, &src0, &src1))
796		c_op = inv_cop[c_op];
797
798	rdst = dst;
799	if (dst->type != P_TEMP)
800		dst = alloc_temp(pc, NULL);
801
802	/* set.u32 */
803	set_long(pc, e);
804	e->inst[0] |= 0xb0000000;
805	e->inst[1] |= (3 << 29);
806	e->inst[1] |= (c_op << 14);
807	/*XXX: breaks things, .u32 by default?
808	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
809	 *     doesn't seem to match what the hw actually does.
810	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
811	 */
812	set_dst(pc, dst, e);
813	set_src_0(pc, src0, e);
814	set_src_1(pc, src1, e);
815	emit(pc, e);
816
817	/* cvt.f32.u32 */
818	e = exec(pc);
819	e->inst[0] = 0xa0000001;
820	e->inst[1] = 0x64014780;
821	set_dst(pc, rdst, e);
822	set_src_0(pc, dst, e);
823	emit(pc, e);
824
825	if (dst != rdst)
826		free_temp(pc, dst);
827}
828
829static void
830emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
831{
832	struct nv50_program_exec *e = exec(pc);
833
834	e->inst[0] = 0xa0000000; /* cvt */
835	set_long(pc, e);
836	e->inst[1] |= (6 << 29); /* cvt */
837	e->inst[1] |= 0x08000000; /* integer mode */
838	e->inst[1] |= 0x04000000; /* 32 bit */
839	e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
840	e->inst[1] |= (1 << 14); /* src .f32 */
841	set_dst(pc, dst, e);
842	set_src_0(pc, src, e);
843
844	emit(pc, e);
845}
846
847static void
848emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
849	 struct nv50_reg *v, struct nv50_reg *e)
850{
851	struct nv50_reg *temp = alloc_temp(pc, NULL);
852
853	emit_flop(pc, 3, temp, v);
854	emit_mul(pc, temp, temp, e);
855	emit_preex2(pc, temp, temp);
856	emit_flop(pc, 6, dst, temp);
857
858	free_temp(pc, temp);
859}
860
861static void
862emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
863{
864	struct nv50_program_exec *e = exec(pc);
865
866	e->inst[0] = 0xa0000000; /* cvt */
867	set_long(pc, e);
868	e->inst[1] |= (6 << 29); /* cvt */
869	e->inst[1] |= 0x04000000; /* 32 bit */
870	e->inst[1] |= (1 << 14); /* src .f32 */
871	e->inst[1] |= ((1 << 6) << 14); /* .abs */
872	set_dst(pc, dst, e);
873	set_src_0(pc, src, e);
874
875	emit(pc, e);
876}
877
878static void
879emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
880	 struct nv50_reg **src)
881{
882	struct nv50_reg *one = alloc_immd(pc, 1.0);
883	struct nv50_reg *zero = alloc_immd(pc, 0.0);
884	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
885	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
886	struct nv50_reg *tmp[4];
887	boolean allow32 = pc->allow32;
888
889	pc->allow32 = FALSE;
890
891	if (mask & (3 << 1)) {
892		tmp[0] = alloc_temp(pc, NULL);
893		emit_minmax(pc, 4, tmp[0], src[0], zero);
894	}
895
896	if (mask & (1 << 2)) {
897		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
898
899		tmp[1] = temp_temp(pc);
900		emit_minmax(pc, 4, tmp[1], src[1], zero);
901
902		tmp[3] = temp_temp(pc);
903		emit_minmax(pc, 4, tmp[3], src[3], neg128);
904		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
905
906		emit_pow(pc, dst[2], tmp[1], tmp[3]);
907		emit_mov(pc, dst[2], zero);
908		set_pred(pc, 3, 0, pc->p->exec_tail);
909	}
910
911	if (mask & (1 << 1))
912		assimilate_temp(pc, dst[1], tmp[0]);
913	else
914	if (mask & (1 << 2))
915		free_temp(pc, tmp[0]);
916
917	pc->allow32 = allow32;
918
919	/* do this last, in case src[i,j] == dst[0,3] */
920	if (mask & (1 << 0))
921		emit_mov(pc, dst[0], one);
922
923	if (mask & (1 << 3))
924		emit_mov(pc, dst[3], one);
925
926	FREE(pos128);
927	FREE(neg128);
928	FREE(zero);
929	FREE(one);
930}
931
932static void
933emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
934{
935	struct nv50_program_exec *e = exec(pc);
936
937	set_long(pc, e);
938	e->inst[0] |= 0xa0000000; /* delta */
939	e->inst[1] |= (7 << 29); /* delta */
940	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
941	e->inst[1] |= (1 << 14); /* src .f32 */
942	set_dst(pc, dst, e);
943	set_src_0(pc, src, e);
944
945	emit(pc, e);
946}
947
948static void
949emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
950{
951	struct nv50_program_exec *e;
952	const int r_pred = 1;
953
954	/* Sets predicate reg ? */
955	e = exec(pc);
956	e->inst[0] = 0xa00001fd;
957	e->inst[1] = 0xc4014788;
958	set_src_0(pc, src, e);
959	set_pred_wr(pc, 1, r_pred, e);
960	emit(pc, e);
961
962	/* This is probably KILP */
963	e = exec(pc);
964	e->inst[0] = 0x000001fe;
965	set_long(pc, e);
966	set_pred(pc, 1 /* LT? */, r_pred, e);
967	emit(pc, e);
968}
969
970static void
971emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
972	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
973{
974	struct nv50_reg *temp, *t[4];
975	struct nv50_program_exec *e;
976
977	unsigned c, mode, dim;
978
979	switch (type) {
980	case TGSI_TEXTURE_1D:
981		dim = 1;
982		break;
983	case TGSI_TEXTURE_UNKNOWN:
984	case TGSI_TEXTURE_2D:
985	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
986	case TGSI_TEXTURE_RECT:
987		dim = 2;
988		break;
989	case TGSI_TEXTURE_3D:
990	case TGSI_TEXTURE_CUBE:
991	case TGSI_TEXTURE_SHADOW2D:
992	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
993		dim = 3;
994		break;
995	default:
996		assert(0);
997		break;
998	}
999
1000	alloc_temp4(pc, t, 0);
1001
1002	if (proj) {
1003		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1004			mode = pc->interp_mode[src[0]->index];
1005
1006			t[3]->rhw = src[3]->rhw;
1007			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1008			emit_flop(pc, 0, t[3], t[3]);
1009
1010			for (c = 0; c < dim; c++) {
1011				t[c]->rhw = src[c]->rhw;
1012				emit_interp(pc, t[c], t[3],
1013					    (mode | INTERP_PERSPECTIVE));
1014			}
1015		} else {
1016			emit_flop(pc, 0, t[3], src[3]);
1017			for (c = 0; c < dim; c++)
1018				emit_mul(pc, t[c], src[c], t[3]);
1019
1020			/* XXX: for some reason the blob sometimes uses MAD:
1021			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1022			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1023			 */
1024		}
1025	} else {
1026		if (type == TGSI_TEXTURE_CUBE) {
1027			temp = temp_temp(pc);
1028			emit_minmax(pc, 4, temp, src[0], src[1]);
1029			emit_minmax(pc, 4, temp, temp, src[2]);
1030			emit_flop(pc, 0, temp, temp);
1031			for (c = 0; c < 3; c++)
1032				emit_mul(pc, t[c], src[c], temp);
1033		} else {
1034			for (c = 0; c < dim; c++)
1035				emit_mov(pc, t[c], src[c]);
1036		}
1037	}
1038
1039	e = exec(pc);
1040	set_long(pc, e);
1041	e->inst[0] |= 0xf0000000;
1042	e->inst[1] |= 0x00000004;
1043	set_dst(pc, t[0], e);
1044	e->inst[0] |= (unit << 9);
1045
1046	if (dim == 2)
1047		e->inst[0] |= 0x00400000;
1048	else
1049	if (dim == 3)
1050		e->inst[0] |= 0x00800000;
1051
1052	e->inst[0] |= (mask & 0x3) << 25;
1053	e->inst[1] |= (mask & 0xc) << 12;
1054
1055	emit(pc, e);
1056
1057#if 1
1058	if (mask & 1) emit_mov(pc, dst[0], t[0]);
1059	if (mask & 2) emit_mov(pc, dst[1], t[1]);
1060	if (mask & 4) emit_mov(pc, dst[2], t[2]);
1061	if (mask & 8) emit_mov(pc, dst[3], t[3]);
1062
1063	free_temp4(pc, t);
1064#else
1065	/* XXX: if p.e. MUL is used directly after TEX, it would still use
1066	 * the texture coordinates, not the fetched values: latency ? */
1067
1068	for (c = 0; c < 4; c++) {
1069		if (mask & (1 << c))
1070			assimilate_temp(pc, dst[c], t[c]);
1071		else
1072			free_temp(pc, t[c]);
1073	}
1074#endif
1075}
1076
1077static void
1078convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1079{
1080	unsigned q = 0, m = ~0;
1081
1082	assert(!is_long(e));
1083
1084	switch (e->inst[0] >> 28) {
1085	case 0x1:
1086		/* MOV */
1087		q = 0x0403c000;
1088		m = 0xffff7fff;
1089		break;
1090	case 0x8:
1091		/* INTERP */
1092		m = ~0x02000000;
1093		if (e->inst[0] & 0x02000000)
1094			q = 0x00020000;
1095		break;
1096	case 0x9:
1097		/* RCP */
1098		break;
1099	case 0xB:
1100		/* ADD */
1101		m = ~(127 << 16);
1102		q = ((e->inst[0] & (~m)) >> 2);
1103		break;
1104	case 0xC:
1105		/* MUL */
1106		m = ~0x00008000;
1107		q = ((e->inst[0] & (~m)) << 12);
1108		break;
1109	case 0xE:
1110		/* MAD (if src2 == dst) */
1111		q = ((e->inst[0] & 0x1fc) << 12);
1112		break;
1113	default:
1114		assert(0);
1115		break;
1116	}
1117
1118	set_long(pc, e);
1119	pc->p->exec_size++;
1120
1121	e->inst[0] &= m;
1122	e->inst[1] |= q;
1123}
1124
1125static struct nv50_reg *
1126tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1127{
1128	switch (dst->DstRegister.File) {
1129	case TGSI_FILE_TEMPORARY:
1130		return &pc->temp[dst->DstRegister.Index * 4 + c];
1131	case TGSI_FILE_OUTPUT:
1132		return &pc->result[dst->DstRegister.Index * 4 + c];
1133	case TGSI_FILE_NULL:
1134		return NULL;
1135	default:
1136		break;
1137	}
1138
1139	return NULL;
1140}
1141
1142static struct nv50_reg *
1143tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
1144{
1145	struct nv50_reg *r = NULL;
1146	struct nv50_reg *temp;
1147	unsigned sgn, c;
1148
1149	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1150
1151	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1152	switch (c) {
1153	case TGSI_EXTSWIZZLE_X:
1154	case TGSI_EXTSWIZZLE_Y:
1155	case TGSI_EXTSWIZZLE_Z:
1156	case TGSI_EXTSWIZZLE_W:
1157		switch (src->SrcRegister.File) {
1158		case TGSI_FILE_INPUT:
1159			r = &pc->attr[src->SrcRegister.Index * 4 + c];
1160			break;
1161		case TGSI_FILE_TEMPORARY:
1162			r = &pc->temp[src->SrcRegister.Index * 4 + c];
1163			break;
1164		case TGSI_FILE_CONSTANT:
1165			r = &pc->param[src->SrcRegister.Index * 4 + c];
1166			break;
1167		case TGSI_FILE_IMMEDIATE:
1168			r = &pc->immd[src->SrcRegister.Index * 4 + c];
1169			break;
1170		case TGSI_FILE_SAMPLER:
1171			break;
1172		default:
1173			assert(0);
1174			break;
1175		}
1176		break;
1177	case TGSI_EXTSWIZZLE_ZERO:
1178		r = alloc_immd(pc, 0.0);
1179		return r;
1180	case TGSI_EXTSWIZZLE_ONE:
1181		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1182			return alloc_immd(pc, -1.0);
1183		return alloc_immd(pc, 1.0);
1184	default:
1185		assert(0);
1186		break;
1187	}
1188
1189	switch (sgn) {
1190	case TGSI_UTIL_SIGN_KEEP:
1191		break;
1192	case TGSI_UTIL_SIGN_CLEAR:
1193		temp = temp_temp(pc);
1194		emit_abs(pc, temp, r);
1195		r = temp;
1196		break;
1197	case TGSI_UTIL_SIGN_TOGGLE:
1198		temp = temp_temp(pc);
1199		emit_neg(pc, temp, r);
1200		r = temp;
1201		break;
1202	case TGSI_UTIL_SIGN_SET:
1203		temp = temp_temp(pc);
1204		emit_abs(pc, temp, r);
1205		emit_neg(pc, temp, temp);
1206		r = temp;
1207		break;
1208	default:
1209		assert(0);
1210		break;
1211	}
1212
1213	return r;
1214}
1215
1216/* returns TRUE if instruction can overwrite sources before they're read */
1217static boolean
1218direct2dest_op(const struct tgsi_full_instruction *insn)
1219{
1220	if (insn->Instruction.Saturate)
1221		return FALSE;
1222
1223	switch (insn->Instruction.Opcode) {
1224	case TGSI_OPCODE_COS:
1225	case TGSI_OPCODE_DP3:
1226	case TGSI_OPCODE_DP4:
1227	case TGSI_OPCODE_DPH:
1228	case TGSI_OPCODE_KIL:
1229	case TGSI_OPCODE_LIT:
1230	case TGSI_OPCODE_POW:
1231	case TGSI_OPCODE_RCP:
1232	case TGSI_OPCODE_RSQ:
1233	case TGSI_OPCODE_SCS:
1234	case TGSI_OPCODE_SIN:
1235	case TGSI_OPCODE_TEX:
1236	case TGSI_OPCODE_TXP:
1237		return FALSE;
1238	default:
1239		return TRUE;
1240	}
1241}
1242
1243static boolean
1244nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
1245{
1246	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
1247	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
1248	unsigned mask, sat, unit;
1249	boolean assimilate = FALSE;
1250	int i, c;
1251
1252	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1253	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1254
1255	for (c = 0; c < 4; c++) {
1256		if (mask & (1 << c))
1257			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1258		else
1259			dst[c] = NULL;
1260		rdst[c] = NULL;
1261		src[0][c] = NULL;
1262		src[1][c] = NULL;
1263		src[2][c] = NULL;
1264	}
1265
1266	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1267		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1268
1269		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1270			unit = fs->SrcRegister.Index;
1271
1272		for (c = 0; c < 4; c++)
1273			src[i][c] = tgsi_src(pc, c, fs);
1274	}
1275
1276	if (sat) {
1277		for (c = 0; c < 4; c++) {
1278			rdst[c] = dst[c];
1279			dst[c] = temp_temp(pc);
1280		}
1281	} else
1282	if (direct2dest_op(inst)) {
1283		for (c = 0; c < 4; c++) {
1284			if (!dst[c] || dst[c]->type != P_TEMP)
1285				continue;
1286
1287			for (i = c + 1; i < 4; i++) {
1288				if (dst[c] == src[0][i] ||
1289				    dst[c] == src[1][i] ||
1290				    dst[c] == src[2][i])
1291					break;
1292			}
1293			if (i == 4)
1294				continue;
1295
1296			assimilate = TRUE;
1297			rdst[c] = dst[c];
1298			dst[c] = alloc_temp(pc, NULL);
1299		}
1300	}
1301
1302	switch (inst->Instruction.Opcode) {
1303	case TGSI_OPCODE_ABS:
1304		for (c = 0; c < 4; c++) {
1305			if (!(mask & (1 << c)))
1306				continue;
1307			emit_abs(pc, dst[c], src[0][c]);
1308		}
1309		break;
1310	case TGSI_OPCODE_ADD:
1311		for (c = 0; c < 4; c++) {
1312			if (!(mask & (1 << c)))
1313				continue;
1314			emit_add(pc, dst[c], src[0][c], src[1][c]);
1315		}
1316		break;
1317	case TGSI_OPCODE_COS:
1318		temp = temp_temp(pc);
1319		emit_precossin(pc, temp, src[0][0]);
1320		emit_flop(pc, 5, temp, temp);
1321		for (c = 0; c < 4; c++) {
1322			if (!(mask & (1 << c)))
1323				continue;
1324			emit_mov(pc, dst[c], temp);
1325		}
1326		break;
1327	case TGSI_OPCODE_DP3:
1328		temp = temp_temp(pc);
1329		emit_mul(pc, temp, src[0][0], src[1][0]);
1330		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1331		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1332		for (c = 0; c < 4; c++) {
1333			if (!(mask & (1 << c)))
1334				continue;
1335			emit_mov(pc, dst[c], temp);
1336		}
1337		break;
1338	case TGSI_OPCODE_DP4:
1339		temp = temp_temp(pc);
1340		emit_mul(pc, temp, src[0][0], src[1][0]);
1341		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1342		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1343		emit_mad(pc, temp, src[0][3], src[1][3], temp);
1344		for (c = 0; c < 4; c++) {
1345			if (!(mask & (1 << c)))
1346				continue;
1347			emit_mov(pc, dst[c], temp);
1348		}
1349		break;
1350	case TGSI_OPCODE_DPH:
1351		temp = temp_temp(pc);
1352		emit_mul(pc, temp, src[0][0], src[1][0]);
1353		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1354		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1355		emit_add(pc, temp, src[1][3], temp);
1356		for (c = 0; c < 4; c++) {
1357			if (!(mask & (1 << c)))
1358				continue;
1359			emit_mov(pc, dst[c], temp);
1360		}
1361		break;
1362	case TGSI_OPCODE_DST:
1363	{
1364		struct nv50_reg *one = alloc_immd(pc, 1.0);
1365		if (mask & (1 << 0))
1366			emit_mov(pc, dst[0], one);
1367		if (mask & (1 << 1))
1368			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1369		if (mask & (1 << 2))
1370			emit_mov(pc, dst[2], src[0][2]);
1371		if (mask & (1 << 3))
1372			emit_mov(pc, dst[3], src[1][3]);
1373		FREE(one);
1374	}
1375		break;
1376	case TGSI_OPCODE_EX2:
1377		temp = temp_temp(pc);
1378		emit_preex2(pc, temp, src[0][0]);
1379		emit_flop(pc, 6, temp, temp);
1380		for (c = 0; c < 4; c++) {
1381			if (!(mask & (1 << c)))
1382				continue;
1383			emit_mov(pc, dst[c], temp);
1384		}
1385		break;
1386	case TGSI_OPCODE_FLR:
1387		for (c = 0; c < 4; c++) {
1388			if (!(mask & (1 << c)))
1389				continue;
1390			emit_flr(pc, dst[c], src[0][c]);
1391		}
1392		break;
1393	case TGSI_OPCODE_FRC:
1394		temp = temp_temp(pc);
1395		for (c = 0; c < 4; c++) {
1396			if (!(mask & (1 << c)))
1397				continue;
1398			emit_flr(pc, temp, src[0][c]);
1399			emit_sub(pc, dst[c], src[0][c], temp);
1400		}
1401		break;
1402	case TGSI_OPCODE_KIL:
1403		emit_kil(pc, src[0][0]);
1404		emit_kil(pc, src[0][1]);
1405		emit_kil(pc, src[0][2]);
1406		emit_kil(pc, src[0][3]);
1407		pc->p->cfg.fp.regs[2] |= 0x00100000;
1408		break;
1409	case TGSI_OPCODE_LIT:
1410		emit_lit(pc, &dst[0], mask, &src[0][0]);
1411		break;
1412	case TGSI_OPCODE_LG2:
1413		temp = temp_temp(pc);
1414		emit_flop(pc, 3, temp, src[0][0]);
1415		for (c = 0; c < 4; c++) {
1416			if (!(mask & (1 << c)))
1417				continue;
1418			emit_mov(pc, dst[c], temp);
1419		}
1420		break;
1421	case TGSI_OPCODE_LRP:
1422		temp = temp_temp(pc);
1423		for (c = 0; c < 4; c++) {
1424			if (!(mask & (1 << c)))
1425				continue;
1426			emit_sub(pc, temp, src[1][c], src[2][c]);
1427			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1428		}
1429		break;
1430	case TGSI_OPCODE_MAD:
1431		for (c = 0; c < 4; c++) {
1432			if (!(mask & (1 << c)))
1433				continue;
1434			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1435		}
1436		break;
1437	case TGSI_OPCODE_MAX:
1438		for (c = 0; c < 4; c++) {
1439			if (!(mask & (1 << c)))
1440				continue;
1441			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1442		}
1443		break;
1444	case TGSI_OPCODE_MIN:
1445		for (c = 0; c < 4; c++) {
1446			if (!(mask & (1 << c)))
1447				continue;
1448			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1449		}
1450		break;
1451	case TGSI_OPCODE_MOV:
1452		for (c = 0; c < 4; c++) {
1453			if (!(mask & (1 << c)))
1454				continue;
1455			emit_mov(pc, dst[c], src[0][c]);
1456		}
1457		break;
1458	case TGSI_OPCODE_MUL:
1459		for (c = 0; c < 4; c++) {
1460			if (!(mask & (1 << c)))
1461				continue;
1462			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1463		}
1464		break;
1465	case TGSI_OPCODE_POW:
1466		temp = temp_temp(pc);
1467		emit_pow(pc, temp, src[0][0], src[1][0]);
1468		for (c = 0; c < 4; c++) {
1469			if (!(mask & (1 << c)))
1470				continue;
1471			emit_mov(pc, dst[c], temp);
1472		}
1473		break;
1474	case TGSI_OPCODE_RCP:
1475		for (c = 3; c >= 0; c--) {
1476			if (!(mask & (1 << c)))
1477				continue;
1478			emit_flop(pc, 0, dst[c], src[0][0]);
1479		}
1480		break;
1481	case TGSI_OPCODE_RSQ:
1482		for (c = 3; c >= 0; c--) {
1483			if (!(mask & (1 << c)))
1484				continue;
1485			emit_flop(pc, 2, dst[c], src[0][0]);
1486		}
1487		break;
1488	case TGSI_OPCODE_SCS:
1489		temp = temp_temp(pc);
1490		emit_precossin(pc, temp, src[0][0]);
1491		if (mask & (1 << 0))
1492			emit_flop(pc, 5, dst[0], temp);
1493		if (mask & (1 << 1))
1494			emit_flop(pc, 4, dst[1], temp);
1495		if (mask & (1 << 2))
1496			emit_mov_immdval(pc, dst[2], 0.0);
1497		if (mask & (1 << 3))
1498			emit_mov_immdval(pc, dst[3], 1.0);
1499		break;
1500	case TGSI_OPCODE_SGE:
1501		for (c = 0; c < 4; c++) {
1502			if (!(mask & (1 << c)))
1503				continue;
1504			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1505		}
1506		break;
1507	case TGSI_OPCODE_SIN:
1508		temp = temp_temp(pc);
1509		emit_precossin(pc, temp, src[0][0]);
1510		emit_flop(pc, 4, temp, temp);
1511		for (c = 0; c < 4; c++) {
1512			if (!(mask & (1 << c)))
1513				continue;
1514			emit_mov(pc, dst[c], temp);
1515		}
1516		break;
1517	case TGSI_OPCODE_SLT:
1518		for (c = 0; c < 4; c++) {
1519			if (!(mask & (1 << c)))
1520				continue;
1521			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1522		}
1523		break;
1524	case TGSI_OPCODE_SUB:
1525		for (c = 0; c < 4; c++) {
1526			if (!(mask & (1 << c)))
1527				continue;
1528			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1529		}
1530		break;
1531	case TGSI_OPCODE_TEX:
1532		emit_tex(pc, dst, mask, src[0], unit,
1533			 inst->InstructionExtTexture.Texture, FALSE);
1534		break;
1535	case TGSI_OPCODE_TXP:
1536		emit_tex(pc, dst, mask, src[0], unit,
1537			 inst->InstructionExtTexture.Texture, TRUE);
1538		break;
1539	case TGSI_OPCODE_XPD:
1540		temp = temp_temp(pc);
1541		if (mask & (1 << 0)) {
1542			emit_mul(pc, temp, src[0][2], src[1][1]);
1543			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1544		}
1545		if (mask & (1 << 1)) {
1546			emit_mul(pc, temp, src[0][0], src[1][2]);
1547			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1548		}
1549		if (mask & (1 << 2)) {
1550			emit_mul(pc, temp, src[0][1], src[1][0]);
1551			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1552		}
1553		if (mask & (1 << 3))
1554			emit_mov_immdval(pc, dst[3], 1.0);
1555		break;
1556	case TGSI_OPCODE_END:
1557		break;
1558	default:
1559		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1560		return FALSE;
1561	}
1562
1563	if (sat) {
1564		for (c = 0; c < 4; c++) {
1565			struct nv50_program_exec *e;
1566
1567			if (!(mask & (1 << c)))
1568				continue;
1569			e = exec(pc);
1570
1571			e->inst[0] = 0xa0000000; /* cvt */
1572			set_long(pc, e);
1573			e->inst[1] |= (6 << 29); /* cvt */
1574			e->inst[1] |= 0x04000000; /* 32 bit */
1575			e->inst[1] |= (1 << 14); /* src .f32 */
1576			e->inst[1] |= ((1 << 5) << 14); /* .sat */
1577			set_dst(pc, rdst[c], e);
1578			set_src_0(pc, dst[c], e);
1579			emit(pc, e);
1580		}
1581	} else if (assimilate) {
1582		for (c = 0; c < 4; c++)
1583			if (rdst[c])
1584				assimilate_temp(pc, rdst[c], dst[c]);
1585	}
1586
1587	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1588		for (c = 0; c < 4; c++) {
1589			if (!src[i][c])
1590				continue;
1591			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1592				FREE(src[i][c]);
1593			else
1594			if (src[i][c]->acc == pc->insn_cur)
1595				release_hw(pc, src[i][c]);
1596		}
1597	}
1598
1599	kill_temp_temp(pc);
1600	return TRUE;
1601}
1602
1603/* Adjust a bitmask that indicates what components of a source are used,
1604 * we use this in tx_prep so we only load interpolants that are needed.
1605 */
1606static void
1607insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
1608{
1609	const struct tgsi_instruction_ext_texture *tex;
1610
1611	switch (insn->Instruction.Opcode) {
1612	case TGSI_OPCODE_DP3:
1613		*mask = 0x7;
1614		break;
1615	case TGSI_OPCODE_DP4:
1616	case TGSI_OPCODE_DPH:
1617		*mask = 0xF;
1618		break;
1619	case TGSI_OPCODE_LIT:
1620		*mask = 0xB;
1621		break;
1622	case TGSI_OPCODE_RCP:
1623	case TGSI_OPCODE_RSQ:
1624		*mask = 0x1;
1625		break;
1626	case TGSI_OPCODE_TEX:
1627	case TGSI_OPCODE_TXP:
1628		assert(insn->Instruction.Extended);
1629		tex = &insn->InstructionExtTexture;
1630
1631		*mask = 0x7;
1632		if (tex->Texture == TGSI_TEXTURE_1D)
1633			*mask = 0x1;
1634		else
1635		if (tex->Texture == TGSI_TEXTURE_2D)
1636			*mask = 0x3;
1637
1638		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1639			*mask |= 0x8;
1640		break;
1641	default:
1642		break;
1643	}
1644}
1645
1646static void
1647prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
1648		  unsigned *r_usage[2])
1649{
1650	const struct tgsi_full_instruction *insn;
1651	const struct tgsi_full_src_register *src;
1652	const struct tgsi_dst_register *dst;
1653
1654	unsigned i, c, k, n, mask, *acc_p;
1655
1656	insn = &tok->FullInstruction;
1657	dst = &insn->FullDstRegisters[0].DstRegister;
1658	mask = dst->WriteMask;
1659
1660	if (!r_usage[0])
1661		r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
1662	if (!r_usage[1])
1663		r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
1664
1665	if (dst->File == TGSI_FILE_TEMPORARY) {
1666		for (c = 0; c < 4; c++) {
1667			if (!(mask & (1 << c)))
1668				continue;
1669			r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
1670		}
1671	}
1672
1673	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1674		src = &insn->FullSrcRegisters[i];
1675
1676		switch (src->SrcRegister.File) {
1677		case TGSI_FILE_TEMPORARY:
1678			acc_p = r_usage[0];
1679			break;
1680		case TGSI_FILE_INPUT:
1681			acc_p = r_usage[1];
1682			break;
1683		default:
1684			continue;
1685		}
1686
1687		insn_adjust_mask(insn, &mask);
1688
1689		for (c = 0; c < 4; c++) {
1690			if (!(mask & (1 << c)))
1691				continue;
1692
1693			k = tgsi_util_get_full_src_register_extswizzle(src, c);
1694			switch (k) {
1695			case TGSI_EXTSWIZZLE_X:
1696			case TGSI_EXTSWIZZLE_Y:
1697			case TGSI_EXTSWIZZLE_Z:
1698			case TGSI_EXTSWIZZLE_W:
1699				n = src->SrcRegister.Index * 4 + k;
1700				acc_p[n] = pc->insn_nr;
1701				break;
1702			default:
1703				break;
1704			}
1705		}
1706	}
1707}
1708
1709static unsigned
1710load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
1711	       int *aid, int *p_oid)
1712{
1713	struct nv50_reg *iv;
1714	int oid, c, n;
1715	unsigned mask = 0;
1716
1717	iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
1718
1719	for (c = 0, n = i * 4; c < 4; c++, n++) {
1720		oid = (*p_oid)++;
1721		pc->attr[n].type = P_TEMP;
1722		pc->attr[n].index = i;
1723
1724		if (pc->attr[n].acc == acc[n])
1725			continue;
1726		mask |= (1 << c);
1727
1728		pc->attr[n].acc = acc[n];
1729		pc->attr[n].rhw = pc->attr[n].hw = -1;
1730		alloc_reg(pc, &pc->attr[n]);
1731
1732		pc->attr[n].rhw = (*aid)++;
1733		emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
1734
1735		pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
1736		(*mid)++;
1737		pc->p->cfg.fp.regs[1] += 0x00010001;
1738	}
1739
1740	return mask;
1741}
1742
1743static boolean
1744nv50_program_tx_prep(struct nv50_pc *pc)
1745{
1746	struct tgsi_parse_context p;
1747	boolean ret = FALSE;
1748	unsigned i, c;
1749	unsigned fcol, bcol, fcrd, depr;
1750
1751	/* count (centroid) perspective interpolations */
1752	unsigned centroid_loads = 0;
1753	unsigned perspect_loads = 0;
1754
1755	/* track register access for temps and attrs */
1756	unsigned *r_usage[2];
1757	r_usage[0] = NULL;
1758	r_usage[1] = NULL;
1759
1760	depr = fcol = bcol = fcrd = 0xffff;
1761
1762	if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1763		pc->p->cfg.fp.regs[0] = 0x01000404;
1764		pc->p->cfg.fp.regs[1] = 0x00000400;
1765	}
1766
1767	tgsi_parse_init(&p, pc->p->pipe.tokens);
1768	while (!tgsi_parse_end_of_tokens(&p)) {
1769		const union tgsi_full_token *tok = &p.FullToken;
1770
1771		tgsi_parse_token(&p);
1772		switch (tok->Token.Type) {
1773		case TGSI_TOKEN_TYPE_IMMEDIATE:
1774		{
1775			const struct tgsi_full_immediate *imm =
1776				&p.FullToken.FullImmediate;
1777
1778			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1779				      imm->u.ImmediateFloat32[1].Float,
1780				      imm->u.ImmediateFloat32[2].Float,
1781				      imm->u.ImmediateFloat32[3].Float);
1782		}
1783			break;
1784		case TGSI_TOKEN_TYPE_DECLARATION:
1785		{
1786			const struct tgsi_full_declaration *d;
1787			unsigned last, first, mode;
1788
1789			d = &p.FullToken.FullDeclaration;
1790			first = d->DeclarationRange.First;
1791			last = d->DeclarationRange.Last;
1792
1793			switch (d->Declaration.File) {
1794			case TGSI_FILE_TEMPORARY:
1795				if (pc->temp_nr < (last + 1))
1796					pc->temp_nr = last + 1;
1797				break;
1798			case TGSI_FILE_OUTPUT:
1799				if (pc->result_nr < (last + 1))
1800					pc->result_nr = last + 1;
1801
1802				if (!d->Declaration.Semantic)
1803					break;
1804
1805				switch (d->Semantic.SemanticName) {
1806				case TGSI_SEMANTIC_POSITION:
1807					depr = first;
1808					pc->p->cfg.fp.regs[2] |= 0x00000100;
1809					pc->p->cfg.fp.regs[3] |= 0x00000011;
1810					break;
1811				default:
1812					break;
1813				}
1814
1815				break;
1816			case TGSI_FILE_INPUT:
1817			{
1818				if (pc->attr_nr < (last + 1))
1819					pc->attr_nr = last + 1;
1820
1821				if (pc->p->type != PIPE_SHADER_FRAGMENT)
1822					break;
1823
1824				switch (d->Declaration.Interpolate) {
1825				case TGSI_INTERPOLATE_CONSTANT:
1826					mode = INTERP_FLAT;
1827					break;
1828				case TGSI_INTERPOLATE_PERSPECTIVE:
1829					mode = INTERP_PERSPECTIVE;
1830					break;
1831				default:
1832					mode = INTERP_LINEAR;
1833					break;
1834				}
1835
1836				if (d->Declaration.Semantic) {
1837					switch (d->Semantic.SemanticName) {
1838					case TGSI_SEMANTIC_POSITION:
1839						fcrd = first;
1840						break;
1841					case TGSI_SEMANTIC_COLOR:
1842						fcol = first;
1843						mode = INTERP_PERSPECTIVE;
1844						break;
1845					case TGSI_SEMANTIC_BCOLOR:
1846						bcol = first;
1847						mode = INTERP_PERSPECTIVE;
1848						break;
1849					}
1850				}
1851
1852				if (d->Declaration.Centroid) {
1853					mode |= INTERP_CENTROID;
1854					if (mode & INTERP_PERSPECTIVE)
1855						centroid_loads++;
1856				} else
1857				if (mode & INTERP_PERSPECTIVE)
1858					perspect_loads++;
1859
1860				assert(last < 32);
1861				for (i = first; i <= last; i++)
1862					pc->interp_mode[i] = mode;
1863			}
1864				break;
1865			case TGSI_FILE_CONSTANT:
1866				if (pc->param_nr < (last + 1))
1867					pc->param_nr = last + 1;
1868				break;
1869			case TGSI_FILE_SAMPLER:
1870				break;
1871			default:
1872				NOUVEAU_ERR("bad decl file %d\n",
1873					    d->Declaration.File);
1874				goto out_err;
1875			}
1876		}
1877			break;
1878		case TGSI_TOKEN_TYPE_INSTRUCTION:
1879			pc->insn_nr++;
1880			prep_inspect_insn(pc, tok, r_usage);
1881			break;
1882		default:
1883			break;
1884		}
1885	}
1886
1887	if (pc->temp_nr) {
1888		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
1889		if (!pc->temp)
1890			goto out_err;
1891
1892		for (i = 0; i < pc->temp_nr; i++) {
1893			for (c = 0; c < 4; c++) {
1894				pc->temp[i*4+c].type = P_TEMP;
1895				pc->temp[i*4+c].hw = -1;
1896				pc->temp[i*4+c].rhw = -1;
1897				pc->temp[i*4+c].index = i;
1898				pc->temp[i*4+c].acc = r_usage[0][i*4+c];
1899			}
1900		}
1901	}
1902
1903	if (pc->attr_nr) {
1904		int oid = 4, mid = 4, aid = 0;
1905		/* oid = VP output id
1906		 * aid = FP attribute/interpolant id
1907		 * mid = VP output mapping field ID
1908		 */
1909
1910		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
1911		if (!pc->attr)
1912			goto out_err;
1913
1914		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1915			/* position should be loaded first */
1916			if (fcrd != 0xffff) {
1917				unsigned mask;
1918				mid = 0;
1919				mask = load_fp_attrib(pc, fcrd, r_usage[1],
1920						      &mid, &aid, &oid);
1921				oid = 0;
1922				pc->p->cfg.fp.regs[1] |= (mask << 24);
1923				pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
1924			}
1925			pc->p->cfg.fp.map[0] += 0x03020100;
1926
1927			/* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
1928
1929			if (perspect_loads) {
1930				pc->iv_p = alloc_temp(pc, NULL);
1931
1932				if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
1933					pc->p->cfg.fp.regs[1] |= 0x08000000;
1934					pc->iv_p->rhw = aid++;
1935					emit_interp(pc, pc->iv_p, NULL,
1936						    INTERP_LINEAR);
1937					emit_flop(pc, 0, pc->iv_p, pc->iv_p);
1938				} else {
1939					pc->iv_p->rhw = aid - 1;
1940					emit_flop(pc, 0, pc->iv_p,
1941						  &pc->attr[fcrd * 4 + 3]);
1942				}
1943			}
1944
1945			if (centroid_loads) {
1946				pc->iv_c = alloc_temp(pc, NULL);
1947				pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
1948				emit_interp(pc, pc->iv_c, NULL,
1949					    INTERP_CENTROID);
1950				emit_flop(pc, 0, pc->iv_c, pc->iv_c);
1951				pc->p->cfg.fp.regs[1] |= 0x08000000;
1952			}
1953
1954			for (c = 0; c < 4; c++) {
1955				/* I don't know what these values do, but
1956				 * let's set them like the blob does:
1957				 */
1958				if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
1959					pc->p->cfg.fp.regs[0] += 0x00010000;
1960				if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
1961					pc->p->cfg.fp.regs[0] += 0x00010000;
1962			}
1963
1964			for (i = 0; i < pc->attr_nr; i++)
1965				load_fp_attrib(pc, i, r_usage[1],
1966					       &mid, &aid, &oid);
1967
1968			if (pc->iv_p)
1969				free_temp(pc, pc->iv_p);
1970			if (pc->iv_c)
1971				free_temp(pc, pc->iv_c);
1972
1973			pc->p->cfg.fp.high_map = (mid / 4);
1974			pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
1975		} else {
1976			/* vertex program */
1977			for (i = 0; i < pc->attr_nr * 4; i++) {
1978				pc->p->cfg.vp.attr[aid / 32] |=
1979					(1 << (aid % 32));
1980				pc->attr[i].type = P_ATTR;
1981				pc->attr[i].hw = aid++;
1982				pc->attr[i].index = i / 4;
1983			}
1984		}
1985	}
1986
1987	if (pc->result_nr) {
1988		int rid = 0;
1989
1990		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
1991		if (!pc->result)
1992			goto out_err;
1993
1994		for (i = 0; i < pc->result_nr; i++) {
1995			for (c = 0; c < 4; c++) {
1996				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1997					pc->result[i*4+c].type = P_TEMP;
1998					pc->result[i*4+c].hw = -1;
1999					pc->result[i*4+c].rhw = (i == depr) ?
2000						-1 : rid++;
2001				} else {
2002					pc->result[i*4+c].type = P_RESULT;
2003					pc->result[i*4+c].hw = rid++;
2004				}
2005				pc->result[i*4+c].index = i;
2006			}
2007
2008			if (pc->p->type == PIPE_SHADER_FRAGMENT &&
2009			    depr != 0xffff) {
2010				pc->result[depr * 4 + 2].rhw =
2011					(pc->result_nr - 1) * 4;
2012			}
2013		}
2014	}
2015
2016	if (pc->param_nr) {
2017		int rid = 0;
2018
2019		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
2020		if (!pc->param)
2021			goto out_err;
2022
2023		for (i = 0; i < pc->param_nr; i++) {
2024			for (c = 0; c < 4; c++) {
2025				pc->param[i*4+c].type = P_CONST;
2026				pc->param[i*4+c].hw = rid++;
2027				pc->param[i*4+c].index = i;
2028			}
2029		}
2030	}
2031
2032	if (pc->immd_nr) {
2033		int rid = 0;
2034
2035		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
2036		if (!pc->immd)
2037			goto out_err;
2038
2039		for (i = 0; i < pc->immd_nr; i++) {
2040			for (c = 0; c < 4; c++) {
2041				pc->immd[i*4+c].type = P_IMMD;
2042				pc->immd[i*4+c].hw = rid++;
2043				pc->immd[i*4+c].index = i;
2044			}
2045		}
2046	}
2047
2048	ret = TRUE;
2049out_err:
2050	if (r_usage[0])
2051		FREE(r_usage[0]);
2052	if (r_usage[1])
2053		FREE(r_usage[1]);
2054
2055	tgsi_parse_free(&p);
2056	return ret;
2057}
2058
2059static void
2060free_nv50_pc(struct nv50_pc *pc)
2061{
2062	if (pc->immd)
2063		FREE(pc->immd);
2064	if (pc->param)
2065		FREE(pc->param);
2066	if (pc->result)
2067		FREE(pc->result);
2068	if (pc->attr)
2069		FREE(pc->attr);
2070	if (pc->temp)
2071		FREE(pc->temp);
2072
2073	FREE(pc);
2074}
2075
2076static boolean
2077nv50_program_tx(struct nv50_program *p)
2078{
2079	struct tgsi_parse_context parse;
2080	struct nv50_pc *pc;
2081	unsigned k;
2082	boolean ret;
2083
2084	pc = CALLOC_STRUCT(nv50_pc);
2085	if (!pc)
2086		return FALSE;
2087	pc->p = p;
2088	pc->p->cfg.high_temp = 4;
2089
2090	ret = nv50_program_tx_prep(pc);
2091	if (ret == FALSE)
2092		goto out_cleanup;
2093
2094	tgsi_parse_init(&parse, pc->p->pipe.tokens);
2095	while (!tgsi_parse_end_of_tokens(&parse)) {
2096		const union tgsi_full_token *tok = &parse.FullToken;
2097
2098		/* don't allow half insn/immd on first and last instruction */
2099		pc->allow32 = TRUE;
2100		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2101			pc->allow32 = FALSE;
2102
2103		tgsi_parse_token(&parse);
2104
2105		switch (tok->Token.Type) {
2106		case TGSI_TOKEN_TYPE_INSTRUCTION:
2107			++pc->insn_cur;
2108			ret = nv50_program_tx_insn(pc, tok);
2109			if (ret == FALSE)
2110				goto out_err;
2111			break;
2112		default:
2113			break;
2114		}
2115	}
2116
2117	if (p->type == PIPE_SHADER_FRAGMENT) {
2118		struct nv50_reg out;
2119
2120		out.type = P_TEMP;
2121		for (k = 0; k < pc->result_nr * 4; k++) {
2122			if (pc->result[k].rhw == -1)
2123				continue;
2124			if (pc->result[k].hw != pc->result[k].rhw) {
2125				out.hw = pc->result[k].rhw;
2126				emit_mov(pc, &out, &pc->result[k]);
2127			}
2128			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
2129				pc->p->cfg.high_result = pc->result[k].rhw + 1;
2130		}
2131	}
2132
2133	/* look for single half instructions and make them long */
2134	struct nv50_program_exec *e, *e_prev;
2135
2136	for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
2137		if (!is_long(e))
2138			k++;
2139
2140		if (!e->next || is_long(e->next)) {
2141			if (k & 1)
2142				convert_to_long(pc, e);
2143			k = 0;
2144		}
2145
2146		if (e->next)
2147			e_prev = e;
2148	}
2149
2150	if (!is_long(pc->p->exec_tail)) {
2151		/* this may occur if moving FP results */
2152		assert(e_prev && !is_long(e_prev));
2153		convert_to_long(pc, e_prev);
2154		convert_to_long(pc, pc->p->exec_tail);
2155	}
2156
2157	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
2158	pc->p->exec_tail->inst[1] |= 0x00000001;
2159
2160	p->param_nr = pc->param_nr * 4;
2161	p->immd_nr = pc->immd_nr * 4;
2162	p->immd = pc->immd_buf;
2163
2164out_err:
2165	tgsi_parse_free(&parse);
2166
2167out_cleanup:
2168	free_nv50_pc(pc);
2169	return ret;
2170}
2171
2172static void
2173nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2174{
2175	if (nv50_program_tx(p) == FALSE)
2176		assert(0);
2177	p->translated = TRUE;
2178}
2179
2180static void
2181nv50_program_upload_data(struct nv50_context *nv50, float *map,
2182			unsigned start, unsigned count, unsigned cbuf)
2183{
2184	struct nouveau_channel *chan = nv50->screen->nvws->channel;
2185	struct nouveau_grobj *tesla = nv50->screen->tesla;
2186
2187	while (count) {
2188		unsigned nr = count > 2047 ? 2047 : count;
2189
2190		BEGIN_RING(chan, tesla, 0x00000f00, 1);
2191		OUT_RING  (chan, (cbuf << 0) | (start << 8));
2192		BEGIN_RING(chan, tesla, 0x40000f04, nr);
2193		OUT_RINGp (chan, map, nr);
2194
2195		map += nr;
2196		start += nr;
2197		count -= nr;
2198	}
2199}
2200
2201static void
2202nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2203{
2204	struct nouveau_winsys *nvws = nv50->screen->nvws;
2205	struct pipe_winsys *ws = nv50->pipe.winsys;
2206
2207	if (!p->data[0] && p->immd_nr) {
2208		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2209
2210		if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0])) {
2211			while (heap->next && heap->size < p->immd_nr) {
2212				struct nv50_program *evict = heap->next->priv;
2213				nvws->res_free(&evict->data[0]);
2214			}
2215
2216			if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0]))
2217				assert(0);
2218		}
2219
2220		/* immediates only need to be uploaded again when freed */
2221		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2222					 p->immd_nr, NV50_CB_PMISC);
2223	}
2224
2225	if (!p->data[1] && p->param_nr) {
2226		struct nouveau_resource *heap =
2227			nv50->screen->parm_heap[p->type];
2228
2229		if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1])) {
2230			while (heap->next && heap->size < p->param_nr) {
2231				struct nv50_program *evict = heap->next->priv;
2232				nvws->res_free(&evict->data[1]);
2233			}
2234
2235			if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1]))
2236				assert(0);
2237		}
2238	}
2239
2240	if (p->param_nr) {
2241		unsigned cbuf = NV50_CB_PVP;
2242		float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
2243					    PIPE_BUFFER_USAGE_CPU_READ);
2244		if (p->type == PIPE_SHADER_FRAGMENT)
2245			cbuf = NV50_CB_PFP;
2246		nv50_program_upload_data(nv50, map, p->data[1]->start,
2247					 p->param_nr, cbuf);
2248		ws->buffer_unmap(ws, nv50->constbuf[p->type]);
2249	}
2250}
2251
2252static void
2253nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2254{
2255	struct nouveau_channel *chan = nv50->screen->nvws->channel;
2256	struct nouveau_grobj *tesla = nv50->screen->tesla;
2257	struct pipe_screen *screen = nv50->pipe.screen;
2258	struct nv50_program_exec *e;
2259	struct nouveau_stateobj *so;
2260	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2261	unsigned start, count, *up, *ptr;
2262	boolean upload = FALSE;
2263
2264	if (!p->buffer) {
2265		p->buffer = screen->buffer_create(screen, 0x100, 0, p->exec_size * 4);
2266		upload = TRUE;
2267	}
2268
2269	if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
2270		(p->data[1] && p->data[1]->start != p->data_start[1])) {
2271		for (e = p->exec_head; e; e = e->next) {
2272			unsigned ei, ci, bs;
2273
2274			if (e->param.index < 0)
2275				continue;
2276			bs = (e->inst[1] >> 22) & 0x07;
2277			assert(bs < 2);
2278			ei = e->param.shift >> 5;
2279			ci = e->param.index + p->data[bs]->start;
2280
2281			e->inst[ei] &= ~e->param.mask;
2282			e->inst[ei] |= (ci << e->param.shift);
2283		}
2284
2285		if (p->data[0])
2286			p->data_start[0] = p->data[0]->start;
2287		if (p->data[1])
2288			p->data_start[1] = p->data[1]->start;
2289
2290		upload = TRUE;
2291	}
2292
2293	if (!upload)
2294		return;
2295
2296#ifdef NV50_PROGRAM_DUMP
2297	NOUVEAU_ERR("-------\n");
2298	for (e = p->exec_head; e; e = e->next) {
2299		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2300		if (is_long(e))
2301			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2302	}
2303#endif
2304
2305	up = ptr = MALLOC(p->exec_size * 4);
2306	for (e = p->exec_head; e; e = e->next) {
2307		*(ptr++) = e->inst[0];
2308		if (is_long(e))
2309			*(ptr++) = e->inst[1];
2310	}
2311
2312	so = so_new(4,2);
2313	so_method(so, nv50->screen->tesla, 0x1280, 3);
2314	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2315	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2316	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2317
2318	start = 0; count = p->exec_size;
2319	while (count) {
2320		struct nouveau_winsys *nvws = nv50->screen->nvws;
2321		unsigned nr;
2322
2323		so_emit(nvws, so);
2324
2325		nr = MIN2(count, 2047);
2326		nr = MIN2(nvws->channel->pushbuf->remaining, nr);
2327		if (nvws->channel->pushbuf->remaining < (nr + 3)) {
2328			FIRE_RING(chan);
2329			continue;
2330		}
2331
2332		BEGIN_RING(chan, tesla, 0x0f00, 1);
2333		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
2334		BEGIN_RING(chan, tesla, 0x40000f04, nr);
2335		OUT_RINGp (chan, up + start, nr);
2336
2337		start += nr;
2338		count -= nr;
2339	}
2340
2341	FREE(up);
2342	so_ref(NULL, &so);
2343}
2344
2345void
2346nv50_vertprog_validate(struct nv50_context *nv50)
2347{
2348	struct nouveau_grobj *tesla = nv50->screen->tesla;
2349	struct nv50_program *p = nv50->vertprog;
2350	struct nouveau_stateobj *so;
2351
2352	if (!p->translated) {
2353		nv50_program_validate(nv50, p);
2354		if (!p->translated)
2355			assert(0);
2356	}
2357
2358	nv50_program_validate_data(nv50, p);
2359	nv50_program_validate_code(nv50, p);
2360
2361	so = so_new(13, 2);
2362	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2363	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2364		  NOUVEAU_BO_HIGH, 0, 0);
2365	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2366		  NOUVEAU_BO_LOW, 0, 0);
2367	so_method(so, tesla, 0x1650, 2);
2368	so_data  (so, p->cfg.vp.attr[0]);
2369	so_data  (so, p->cfg.vp.attr[1]);
2370	so_method(so, tesla, 0x16b8, 1);
2371	so_data  (so, p->cfg.high_result);
2372	so_method(so, tesla, 0x16ac, 2);
2373	so_data  (so, p->cfg.high_result); //8);
2374	so_data  (so, p->cfg.high_temp);
2375	so_method(so, tesla, 0x140c, 1);
2376	so_data  (so, 0); /* program start offset */
2377	so_ref(so, &nv50->state.vertprog);
2378	so_ref(NULL, &so);
2379}
2380
2381void
2382nv50_fragprog_validate(struct nv50_context *nv50)
2383{
2384	struct nouveau_grobj *tesla = nv50->screen->tesla;
2385	struct nv50_program *p = nv50->fragprog;
2386	struct nouveau_stateobj *so;
2387	unsigned i;
2388
2389	if (!p->translated) {
2390		nv50_program_validate(nv50, p);
2391		if (!p->translated)
2392			assert(0);
2393	}
2394
2395	nv50_program_validate_data(nv50, p);
2396	nv50_program_validate_code(nv50, p);
2397
2398	so = so_new(64, 2);
2399	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2400	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2401		  NOUVEAU_BO_HIGH, 0, 0);
2402	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2403		  NOUVEAU_BO_LOW, 0, 0);
2404	so_method(so, tesla, 0x1904, 4);
2405	so_data  (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
2406	so_data  (so, 0x00000004);
2407	so_data  (so, 0x00000000);
2408	so_data  (so, 0x00000000);
2409	so_method(so, tesla, 0x16bc, p->cfg.fp.high_map);
2410	for (i = 0; i < p->cfg.fp.high_map; i++)
2411		so_data(so, p->cfg.fp.map[i]);
2412	so_method(so, tesla, 0x1988, 2);
2413	so_data  (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
2414	so_data  (so, p->cfg.high_temp);
2415	so_method(so, tesla, 0x1298, 1);
2416	so_data  (so, p->cfg.high_result);
2417	so_method(so, tesla, 0x19a8, 1);
2418	so_data  (so, p->cfg.fp.regs[2]);
2419	so_method(so, tesla, 0x196c, 1);
2420	so_data  (so, p->cfg.fp.regs[3]);
2421	so_method(so, tesla, 0x1414, 1);
2422	so_data  (so, 0); /* program start offset */
2423	so_ref(so, &nv50->state.fragprog);
2424	so_ref(NULL, &so);
2425}
2426
2427void
2428nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2429{
2430	struct pipe_screen *pscreen = nv50->pipe.screen;
2431
2432	while (p->exec_head) {
2433		struct nv50_program_exec *e = p->exec_head;
2434
2435		p->exec_head = e->next;
2436		FREE(e);
2437	}
2438	p->exec_tail = NULL;
2439	p->exec_size = 0;
2440
2441	if (p->buffer)
2442		pipe_buffer_reference(&p->buffer, NULL);
2443
2444	nv50->screen->nvws->res_free(&p->data[0]);
2445	nv50->screen->nvws->res_free(&p->data[1]);
2446
2447	p->translated = 0;
2448}
2449
2450