nv50_program.c revision fda01b584715c05696a0e6768fda669ef1eb5f3b
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
34#define NV50_SU_MAX_TEMP 64
35#define NV50_PROGRAM_DUMP
36
37/* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * 	- Fuck it off, introduce a way to negate args for ops that
41 * 	  support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
 * case, if one of the set_src_*() helpers causes the inst to suddenly become
 * long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
 * WARNING: watch out for dst==src vectors, a write can overwrite components
 * that are still needed as sources.
 * 	ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * 	FP attr/result assignment - how?
58 * 		attrib
59 * 			- 0x16bc maps vp output onto fp hpos
60 * 			- 0x16c0 maps vp output onto fp col0
61 * 		result
62 * 			- colr always 0-3
63 * 			- depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * 	      "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * 	- 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * 	- XX == FP high something
75 */
/* A single scalar register operand as tracked by the translator. */
struct nv50_reg {
	/* Register file the operand lives in. */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;
	/* -1 for registers created internally by alloc_temp()/alloc_immd();
	 * free_temp() only releases registers whose index is -1. */
	int index;

	/* Hardware slot.  For P_TEMP, -1 means "not assigned yet" and
	 * alloc_reg() picks a slot on first use.  For P_IMMD this is the
	 * offset into nv50_pc::immd_buf (see set_immd()). */
	int hw;
	/* Not referenced by any code visible in this part of the file. */
	int neg;
};
89
/* Per-program translation state shared by all emit helpers. */
struct nv50_pc {
	/* Program being built; instructions are appended to its exec list. */
	struct nv50_program *p;

	/* hw resources */
	/* Slot i is in use when r_temp[i] is non-NULL. */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];

	/* tgsi resources */
	/* The register arrays below are indexed as (tgsi index * 4 + channel),
	 * see tgsi_dst()/tgsi_src(). */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	/* Immediate values, four floats per entry (see ctor_immd()). */
	float *immd_buf;
	int immd_nr;

	/* Scratch registers handed out by temp_temp() and released in bulk
	 * by kill_temp_temp(). */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;
};
112
113static void
114alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
115{
116	int i;
117
118	if (reg->type == P_RESULT) {
119		if (pc->p->cfg.high_result < (reg->hw + 1))
120			pc->p->cfg.high_result = reg->hw + 1;
121	}
122
123	if (reg->type != P_TEMP)
124		return;
125
126	if (reg->hw >= 0) {
127		/*XXX: do this here too to catch FP temp-as-attr usage..
128		 *     not clean, but works */
129		if (pc->p->cfg.high_temp < (reg->hw + 1))
130			pc->p->cfg.high_temp = reg->hw + 1;
131		return;
132	}
133
134	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
135		if (!(pc->r_temp[i])) {
136			pc->r_temp[i] = reg;
137			reg->hw = i;
138			if (pc->p->cfg.high_temp < (i + 1))
139				pc->p->cfg.high_temp = i + 1;
140			return;
141		}
142	}
143
144	assert(0);
145}
146
147static struct nv50_reg *
148alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
149{
150	struct nv50_reg *r;
151	int i;
152
153	if (dst && dst->type == P_TEMP && dst->hw == -1)
154		return dst;
155
156	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
157		if (!pc->r_temp[i]) {
158			r = CALLOC_STRUCT(nv50_reg);
159			r->type = P_TEMP;
160			r->index = -1;
161			r->hw = i;
162			pc->r_temp[i] = r;
163			return r;
164		}
165	}
166
167	assert(0);
168	return NULL;
169}
170
171static void
172free_temp(struct nv50_pc *pc, struct nv50_reg *r)
173{
174	if (r->index == -1) {
175		unsigned hw = r->hw;
176
177		FREE(pc->r_temp[hw]);
178		pc->r_temp[hw] = NULL;
179	}
180}
181
182static struct nv50_reg *
183temp_temp(struct nv50_pc *pc)
184{
185	if (pc->temp_temp_nr >= 16)
186		assert(0);
187
188	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
189	return pc->temp_temp[pc->temp_temp_nr++];
190}
191
192static void
193kill_temp_temp(struct nv50_pc *pc)
194{
195	int i;
196
197	for (i = 0; i < pc->temp_temp_nr; i++)
198		free_temp(pc, pc->temp_temp[i]);
199	pc->temp_temp_nr = 0;
200}
201
202static int
203ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
204{
205	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)),
206			       (pc->immd_nr + 1) * 4 * sizeof(float));
207	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
208	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
209	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
210	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
211
212	return pc->immd_nr++;
213}
214
215static struct nv50_reg *
216alloc_immd(struct nv50_pc *pc, float f)
217{
218	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
219	unsigned hw;
220
221	hw = ctor_immd(pc, f, 0, 0, 0) * 4;
222	r->type = P_IMMD;
223	r->hw = hw;
224	r->index = -1;
225	return r;
226}
227
228static struct nv50_program_exec *
229exec(struct nv50_pc *pc)
230{
231	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
232
233	e->param.index = -1;
234	return e;
235}
236
237static void
238emit(struct nv50_pc *pc, struct nv50_program_exec *e)
239{
240	struct nv50_program *p = pc->p;
241
242	if (p->exec_tail)
243		p->exec_tail->next = e;
244	if (!p->exec_head)
245		p->exec_head = e;
246	p->exec_tail = e;
247	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
248}
249
250static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
251
252static boolean
253is_long(struct nv50_program_exec *e)
254{
255	if (e->inst[0] & 1)
256		return TRUE;
257	return FALSE;
258}
259
260static boolean
261is_immd(struct nv50_program_exec *e)
262{
263	if (is_long(e) && (e->inst[1] & 3) == 3)
264		return TRUE;
265	return FALSE;
266}
267
268static INLINE void
269set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
270	 struct nv50_program_exec *e)
271{
272	set_long(pc, e);
273	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
274	e->inst[1] |= (pred << 7) | (idx << 12);
275}
276
277static INLINE void
278set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
279	    struct nv50_program_exec *e)
280{
281	set_long(pc, e);
282	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
283	e->inst[1] |= (idx << 4) | (on << 6);
284}
285
286static INLINE void
287set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
288{
289	if (is_long(e))
290		return;
291
292	e->inst[0] |= 1;
293	set_pred(pc, 0xf, 0, e);
294	set_pred_wr(pc, 0, 0, e);
295}
296
297static INLINE void
298set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
299{
300	if (dst->type == P_RESULT) {
301		set_long(pc, e);
302		e->inst[1] |= 0x00000008;
303	}
304
305	alloc_reg(pc, dst);
306	e->inst[0] |= (dst->hw << 2);
307}
308
309static INLINE void
310set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
311{
312	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
313
314	set_long(pc, e);
315	/*XXX: can't be predicated - bits overlap.. catch cases where both
316	 *     are required and avoid them. */
317	set_pred(pc, 0, 0, e);
318	set_pred_wr(pc, 0, 0, e);
319
320	e->inst[1] |= 0x00000002 | 0x00000001;
321	e->inst[0] |= (val & 0x3f) << 16;
322	e->inst[1] |= (val >> 6) << 2;
323}
324
325static void
326emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
327	    struct nv50_reg *src, struct nv50_reg *iv)
328{
329	struct nv50_program_exec *e = exec(pc);
330
331	e->inst[0] |= 0x80000000;
332	set_dst(pc, dst, e);
333	alloc_reg(pc, src);
334	e->inst[0] |= (src->hw << 16);
335	if (iv) {
336		e->inst[0] |= (1 << 25);
337		alloc_reg(pc, iv);
338		e->inst[0] |= (iv->hw << 9);
339	}
340
341	emit(pc, e);
342}
343
344static void
345set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
346	 struct nv50_program_exec *e)
347{
348	set_long(pc, e);
349#if 1
350	e->inst[1] |= (1 << 22);
351#else
352	if (src->type == P_IMMD) {
353		e->inst[1] |= (NV50_CB_PMISC << 22);
354	} else {
355		if (pc->p->type == PIPE_SHADER_VERTEX)
356			e->inst[1] |= (NV50_CB_PVP << 22);
357		else
358			e->inst[1] |= (NV50_CB_PFP << 22);
359	}
360#endif
361
362	e->param.index = src->hw;
363	e->param.shift = s;
364	e->param.mask = m << (s % 32);
365}
366
367static void
368emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
369{
370	struct nv50_program_exec *e = exec(pc);
371
372	e->inst[0] |= 0x10000000;
373
374	set_dst(pc, dst, e);
375
376	if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
377		set_immd(pc, src, e);
378		/*XXX: 32-bit, but steals part of "half" reg space - need to
379		 *     catch and handle this case if/when we do half-regs
380		 */
381		e->inst[0] |= 0x00008000;
382	} else
383	if (src->type == P_IMMD || src->type == P_CONST) {
384		set_long(pc, e);
385		set_data(pc, src, 0x7f, 9, e);
386		e->inst[1] |= 0x20000000; /* src0 const? */
387	} else {
388		if (src->type == P_ATTR) {
389			set_long(pc, e);
390			e->inst[1] |= 0x00200000;
391		}
392
393		alloc_reg(pc, src);
394		e->inst[0] |= (src->hw << 9);
395	}
396
397	/* We really should support "half" instructions here at some point,
398	 * but I don't feel confident enough about them yet.
399	 */
400	set_long(pc, e);
401	if (is_long(e) && !is_immd(e)) {
402		e->inst[1] |= 0x04000000; /* 32-bit */
403		e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
404	}
405
406	emit(pc, e);
407}
408
409static boolean
410check_swap_src_0_1(struct nv50_pc *pc,
411		   struct nv50_reg **s0, struct nv50_reg **s1)
412{
413	struct nv50_reg *src0 = *s0, *src1 = *s1;
414
415	if (src0->type == P_CONST) {
416		if (src1->type != P_CONST) {
417			*s0 = src1;
418			*s1 = src0;
419			return TRUE;
420		}
421	} else
422	if (src1->type == P_ATTR) {
423		if (src0->type != P_ATTR) {
424			*s0 = src1;
425			*s1 = src0;
426			return TRUE;
427		}
428	}
429
430	return FALSE;
431}
432
433static void
434set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
435{
436	if (src->type == P_ATTR) {
437		set_long(pc, e);
438		e->inst[1] |= 0x00200000;
439	} else
440	if (src->type == P_CONST || src->type == P_IMMD) {
441		struct nv50_reg *temp = temp_temp(pc);
442
443		emit_mov(pc, temp, src);
444		src = temp;
445	}
446
447	alloc_reg(pc, src);
448	e->inst[0] |= (src->hw << 9);
449}
450
451static void
452set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
453{
454	if (src->type == P_ATTR) {
455		struct nv50_reg *temp = temp_temp(pc);
456
457		emit_mov(pc, temp, src);
458		src = temp;
459	} else
460	if (src->type == P_CONST || src->type == P_IMMD) {
461		assert(!(e->inst[0] & 0x00800000));
462		if (e->inst[0] & 0x01000000) {
463			struct nv50_reg *temp = temp_temp(pc);
464
465			emit_mov(pc, temp, src);
466			src = temp;
467		} else {
468			set_data(pc, src, 0x7f, 16, e);
469			e->inst[0] |= 0x00800000;
470		}
471	}
472
473	alloc_reg(pc, src);
474	e->inst[0] |= (src->hw << 16);
475}
476
477static void
478set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
479{
480	set_long(pc, e);
481
482	if (src->type == P_ATTR) {
483		struct nv50_reg *temp = temp_temp(pc);
484
485		emit_mov(pc, temp, src);
486		src = temp;
487	} else
488	if (src->type == P_CONST || src->type == P_IMMD) {
489		assert(!(e->inst[0] & 0x01000000));
490		if (e->inst[0] & 0x00800000) {
491			struct nv50_reg *temp = temp_temp(pc);
492
493			emit_mov(pc, temp, src);
494			src = temp;
495		} else {
496			set_data(pc, src, 0x7f, 32+14, e);
497			e->inst[0] |= 0x01000000;
498		}
499	}
500
501	alloc_reg(pc, src);
502	e->inst[1] |= (src->hw << 14);
503}
504
505static void
506emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
507	 struct nv50_reg *src1)
508{
509	struct nv50_program_exec *e = exec(pc);
510
511	e->inst[0] |= 0xc0000000;
512	set_long(pc, e);
513
514	check_swap_src_0_1(pc, &src0, &src1);
515	set_dst(pc, dst, e);
516	set_src_0(pc, src0, e);
517	set_src_1(pc, src1, e);
518
519	emit(pc, e);
520}
521
522static void
523emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
524	 struct nv50_reg *src0, struct nv50_reg *src1)
525{
526	struct nv50_program_exec *e = exec(pc);
527
528	e->inst[0] |= 0xb0000000;
529
530	check_swap_src_0_1(pc, &src0, &src1);
531	set_dst(pc, dst, e);
532	set_src_0(pc, src0, e);
533	if (is_long(e))
534		set_src_2(pc, src1, e);
535	else
536		set_src_1(pc, src1, e);
537
538	emit(pc, e);
539}
540
541static void
542emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
543	    struct nv50_reg *src0, struct nv50_reg *src1)
544{
545	struct nv50_program_exec *e = exec(pc);
546
547	set_long(pc, e);
548	e->inst[0] |= 0xb0000000;
549	e->inst[1] |= (sub << 29);
550
551	check_swap_src_0_1(pc, &src0, &src1);
552	set_dst(pc, dst, e);
553	set_src_0(pc, src0, e);
554	set_src_1(pc, src1, e);
555
556	emit(pc, e);
557}
558
559static void
560emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
561	 struct nv50_reg *src1)
562{
563	struct nv50_program_exec *e = exec(pc);
564
565	e->inst[0] |= 0xb0000000;
566
567	set_long(pc, e);
568	if (check_swap_src_0_1(pc, &src0, &src1))
569		e->inst[1] |= 0x04000000;
570	else
571		e->inst[1] |= 0x08000000;
572
573	set_dst(pc, dst, e);
574	set_src_0(pc, src0, e);
575	set_src_2(pc, src1, e);
576
577	emit(pc, e);
578}
579
580static void
581emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
582	 struct nv50_reg *src1, struct nv50_reg *src2)
583{
584	struct nv50_program_exec *e = exec(pc);
585
586	e->inst[0] |= 0xe0000000;
587
588	check_swap_src_0_1(pc, &src0, &src1);
589	set_dst(pc, dst, e);
590	set_src_0(pc, src0, e);
591	set_src_1(pc, src1, e);
592	set_src_2(pc, src2, e);
593
594	emit(pc, e);
595}
596
597static void
598emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
599	 struct nv50_reg *src1, struct nv50_reg *src2)
600{
601	struct nv50_program_exec *e = exec(pc);
602
603	e->inst[0] |= 0xe0000000;
604	set_long(pc, e);
605	e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
606
607	check_swap_src_0_1(pc, &src0, &src1);
608	set_dst(pc, dst, e);
609	set_src_0(pc, src0, e);
610	set_src_1(pc, src1, e);
611	set_src_2(pc, src2, e);
612
613	emit(pc, e);
614}
615
616static void
617emit_flop(struct nv50_pc *pc, unsigned sub,
618	  struct nv50_reg *dst, struct nv50_reg *src)
619{
620	struct nv50_program_exec *e = exec(pc);
621
622	e->inst[0] |= 0x90000000;
623	if (sub) {
624		set_long(pc, e);
625		e->inst[1] |= (sub << 29);
626	}
627
628	set_dst(pc, dst, e);
629	set_src_0(pc, src, e);
630
631	emit(pc, e);
632}
633
634static void
635emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
636{
637	struct nv50_program_exec *e = exec(pc);
638
639	e->inst[0] |= 0xb0000000;
640
641	set_dst(pc, dst, e);
642	set_src_0(pc, src, e);
643	set_long(pc, e);
644	e->inst[1] |= (6 << 29) | 0x00004000;
645
646	emit(pc, e);
647}
648
649static void
650emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
651{
652	struct nv50_program_exec *e = exec(pc);
653
654	e->inst[0] |= 0xb0000000;
655
656	set_dst(pc, dst, e);
657	set_src_0(pc, src, e);
658	set_long(pc, e);
659	e->inst[1] |= (6 << 29);
660
661	emit(pc, e);
662}
663
664static void
665emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
666	 struct nv50_reg *src0, struct nv50_reg *src1)
667{
668	struct nv50_program_exec *e = exec(pc);
669	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
670	struct nv50_reg *rdst;
671
672	assert(c_op <= 7);
673	if (check_swap_src_0_1(pc, &src0, &src1))
674		c_op = inv_cop[c_op];
675
676	rdst = dst;
677	if (dst->type != P_TEMP)
678		dst = alloc_temp(pc, NULL);
679
680	/* set.u32 */
681	set_long(pc, e);
682	e->inst[0] |= 0xb0000000;
683	e->inst[1] |= (3 << 29);
684	e->inst[1] |= (c_op << 14);
685	/*XXX: breaks things, .u32 by default?
686	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
687	 *     doesn't seem to match what the hw actually does.
688	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
689	 */
690	set_dst(pc, dst, e);
691	set_src_0(pc, src0, e);
692	set_src_1(pc, src1, e);
693	emit(pc, e);
694
695	/* cvt.f32.u32 */
696	e = exec(pc);
697	e->inst[0] = 0xa0000001;
698	e->inst[1] = 0x64014780;
699	set_dst(pc, rdst, e);
700	set_src_0(pc, dst, e);
701	emit(pc, e);
702
703	if (dst != rdst)
704		free_temp(pc, dst);
705}
706
707static void
708emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
709{
710	struct nv50_program_exec *e = exec(pc);
711
712	e->inst[0] = 0xa0000000; /* cvt */
713	set_long(pc, e);
714	e->inst[1] |= (6 << 29); /* cvt */
715	e->inst[1] |= 0x08000000; /* integer mode */
716	e->inst[1] |= 0x04000000; /* 32 bit */
717	e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
718	e->inst[1] |= (1 << 14); /* src .f32 */
719	set_dst(pc, dst, e);
720	set_src_0(pc, src, e);
721
722	emit(pc, e);
723}
724
725static void
726emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
727	 struct nv50_reg *v, struct nv50_reg *e)
728{
729	struct nv50_reg *temp = alloc_temp(pc, NULL);
730
731	emit_flop(pc, 3, temp, v);
732	emit_mul(pc, temp, temp, e);
733	emit_preex2(pc, temp, temp);
734	emit_flop(pc, 6, dst, temp);
735
736	free_temp(pc, temp);
737}
738
739static void
740emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
741{
742	struct nv50_program_exec *e = exec(pc);
743
744	e->inst[0] = 0xa0000000; /* cvt */
745	set_long(pc, e);
746	e->inst[1] |= (6 << 29); /* cvt */
747	e->inst[1] |= 0x04000000; /* 32 bit */
748	e->inst[1] |= (1 << 14); /* src .f32 */
749	e->inst[1] |= ((1 << 6) << 14); /* .abs */
750	set_dst(pc, dst, e);
751	set_src_0(pc, src, e);
752
753	emit(pc, e);
754}
755
756static void
757emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
758	 struct nv50_reg **src)
759{
760	struct nv50_reg *one = alloc_immd(pc, 1.0);
761	struct nv50_reg *zero = alloc_immd(pc, 0.0);
762	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
763	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
764	struct nv50_reg *tmp[4];
765
766	if (mask & (1 << 0))
767		emit_mov(pc, dst[0], one);
768
769	if (mask & (1 << 3))
770		emit_mov(pc, dst[3], one);
771
772	if (mask & (3 << 1)) {
773		if (mask & (1 << 1))
774			tmp[0] = dst[1];
775		else
776			tmp[0] = temp_temp(pc);
777		emit_minmax(pc, 4, tmp[0], src[0], zero);
778	}
779
780	if (mask & (1 << 2)) {
781		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
782
783		tmp[1] = temp_temp(pc);
784		emit_minmax(pc, 4, tmp[1], src[1], zero);
785
786		tmp[3] = temp_temp(pc);
787		emit_minmax(pc, 4, tmp[3], src[3], neg128);
788		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
789
790		emit_pow(pc, dst[2], tmp[1], tmp[3]);
791		emit_mov(pc, dst[2], zero);
792		set_pred(pc, 3, 0, pc->p->exec_tail);
793	}
794}
795
796static void
797emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
798{
799	struct nv50_program_exec *e = exec(pc);
800
801	set_long(pc, e);
802	e->inst[0] |= 0xa0000000; /* delta */
803	e->inst[1] |= (7 << 29); /* delta */
804	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
805	e->inst[1] |= (1 << 14); /* src .f32 */
806	set_dst(pc, dst, e);
807	set_src_0(pc, src, e);
808
809	emit(pc, e);
810}
811
812static struct nv50_reg *
813tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
814{
815	switch (dst->DstRegister.File) {
816	case TGSI_FILE_TEMPORARY:
817		return &pc->temp[dst->DstRegister.Index * 4 + c];
818	case TGSI_FILE_OUTPUT:
819		return &pc->result[dst->DstRegister.Index * 4 + c];
820	case TGSI_FILE_NULL:
821		return NULL;
822	default:
823		break;
824	}
825
826	return NULL;
827}
828
829static struct nv50_reg *
830tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
831{
832	struct nv50_reg *r = NULL;
833	struct nv50_reg *temp;
834	unsigned c;
835
836	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
837	switch (c) {
838	case TGSI_EXTSWIZZLE_X:
839	case TGSI_EXTSWIZZLE_Y:
840	case TGSI_EXTSWIZZLE_Z:
841	case TGSI_EXTSWIZZLE_W:
842		switch (src->SrcRegister.File) {
843		case TGSI_FILE_INPUT:
844			r = &pc->attr[src->SrcRegister.Index * 4 + c];
845			break;
846		case TGSI_FILE_TEMPORARY:
847			r = &pc->temp[src->SrcRegister.Index * 4 + c];
848			break;
849		case TGSI_FILE_CONSTANT:
850			r = &pc->param[src->SrcRegister.Index * 4 + c];
851			break;
852		case TGSI_FILE_IMMEDIATE:
853			r = &pc->immd[src->SrcRegister.Index * 4 + c];
854			break;
855		case TGSI_FILE_SAMPLER:
856			break;
857		default:
858			assert(0);
859			break;
860		}
861		break;
862	case TGSI_EXTSWIZZLE_ZERO:
863		r = alloc_immd(pc, 0.0);
864		break;
865	case TGSI_EXTSWIZZLE_ONE:
866		r = alloc_immd(pc, 1.0);
867		break;
868	default:
869		assert(0);
870		break;
871	}
872
873	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
874	case TGSI_UTIL_SIGN_KEEP:
875		break;
876	case TGSI_UTIL_SIGN_CLEAR:
877		temp = temp_temp(pc);
878		emit_abs(pc, temp, r);
879		r = temp;
880		break;
881	case TGSI_UTIL_SIGN_TOGGLE:
882		temp = temp_temp(pc);
883		emit_neg(pc, temp, r);
884		r = temp;
885		break;
886	case TGSI_UTIL_SIGN_SET:
887		temp = temp_temp(pc);
888		emit_abs(pc, temp, r);
889		emit_neg(pc, temp, r);
890		r = temp;
891		break;
892	default:
893		assert(0);
894		break;
895	}
896
897	return r;
898}
899
900static boolean
901nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
902{
903	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
904	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
905	unsigned mask, sat;
906	int i, c;
907
908	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
909	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
910
911	for (c = 0; c < 4; c++) {
912		if (mask & (1 << c))
913			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
914		else
915			dst[c] = NULL;
916	}
917
918	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
919		for (c = 0; c < 4; c++)
920			src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]);
921	}
922
923	if (sat) {
924		for (c = 0; c < 4; c++) {
925			rdst[c] = dst[c];
926			dst[c] = temp_temp(pc);
927		}
928	}
929
930	switch (inst->Instruction.Opcode) {
931	case TGSI_OPCODE_ABS:
932		for (c = 0; c < 4; c++) {
933			if (!(mask & (1 << c)))
934				continue;
935			emit_abs(pc, dst[c], src[0][c]);
936		}
937		break;
938	case TGSI_OPCODE_ADD:
939		for (c = 0; c < 4; c++) {
940			if (!(mask & (1 << c)))
941				continue;
942			emit_add(pc, dst[c], src[0][c], src[1][c]);
943		}
944		break;
945	case TGSI_OPCODE_COS:
946		temp = alloc_temp(pc, NULL);
947		emit_precossin(pc, temp, src[0][0]);
948		emit_flop(pc, 5, temp, temp);
949		for (c = 0; c < 4; c++) {
950			if (!(mask & (1 << c)))
951				continue;
952			emit_mov(pc, dst[c], temp);
953		}
954		break;
955	case TGSI_OPCODE_DP3:
956		temp = alloc_temp(pc, NULL);
957		emit_mul(pc, temp, src[0][0], src[1][0]);
958		emit_mad(pc, temp, src[0][1], src[1][1], temp);
959		emit_mad(pc, temp, src[0][2], src[1][2], temp);
960		for (c = 0; c < 4; c++) {
961			if (!(mask & (1 << c)))
962				continue;
963			emit_mov(pc, dst[c], temp);
964		}
965		free_temp(pc, temp);
966		break;
967	case TGSI_OPCODE_DP4:
968		temp = alloc_temp(pc, NULL);
969		emit_mul(pc, temp, src[0][0], src[1][0]);
970		emit_mad(pc, temp, src[0][1], src[1][1], temp);
971		emit_mad(pc, temp, src[0][2], src[1][2], temp);
972		emit_mad(pc, temp, src[0][3], src[1][3], temp);
973		for (c = 0; c < 4; c++) {
974			if (!(mask & (1 << c)))
975				continue;
976			emit_mov(pc, dst[c], temp);
977		}
978		free_temp(pc, temp);
979		break;
980	case TGSI_OPCODE_DPH:
981		temp = alloc_temp(pc, NULL);
982		emit_mul(pc, temp, src[0][0], src[1][0]);
983		emit_mad(pc, temp, src[0][1], src[1][1], temp);
984		emit_mad(pc, temp, src[0][2], src[1][2], temp);
985		emit_add(pc, temp, src[1][3], temp);
986		for (c = 0; c < 4; c++) {
987			if (!(mask & (1 << c)))
988				continue;
989			emit_mov(pc, dst[c], temp);
990		}
991		free_temp(pc, temp);
992		break;
993	case TGSI_OPCODE_DST:
994	{
995		struct nv50_reg *one = alloc_immd(pc, 1.0);
996		if (mask & (1 << 0))
997			emit_mov(pc, dst[0], one);
998		if (mask & (1 << 1))
999			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1000		if (mask & (1 << 2))
1001			emit_mov(pc, dst[2], src[0][2]);
1002		if (mask & (1 << 3))
1003			emit_mov(pc, dst[3], src[1][3]);
1004		FREE(one);
1005	}
1006		break;
1007	case TGSI_OPCODE_EX2:
1008		temp = alloc_temp(pc, NULL);
1009		emit_preex2(pc, temp, src[0][0]);
1010		emit_flop(pc, 6, temp, temp);
1011		for (c = 0; c < 4; c++) {
1012			if (!(mask & (1 << c)))
1013				continue;
1014			emit_mov(pc, dst[c], temp);
1015		}
1016		free_temp(pc, temp);
1017		break;
1018	case TGSI_OPCODE_FLR:
1019		for (c = 0; c < 4; c++) {
1020			if (!(mask & (1 << c)))
1021				continue;
1022			emit_flr(pc, dst[c], src[0][c]);
1023		}
1024		break;
1025	case TGSI_OPCODE_FRC:
1026		temp = alloc_temp(pc, NULL);
1027		for (c = 0; c < 4; c++) {
1028			if (!(mask & (1 << c)))
1029				continue;
1030			emit_flr(pc, temp, src[0][c]);
1031			emit_sub(pc, dst[c], src[0][c], temp);
1032		}
1033		free_temp(pc, temp);
1034		break;
1035	case TGSI_OPCODE_LIT:
1036		emit_lit(pc, &dst[0], mask, &src[0][0]);
1037		break;
1038	case TGSI_OPCODE_LG2:
1039		temp = alloc_temp(pc, NULL);
1040		emit_flop(pc, 3, temp, src[0][0]);
1041		for (c = 0; c < 4; c++) {
1042			if (!(mask & (1 << c)))
1043				continue;
1044			emit_mov(pc, dst[c], temp);
1045		}
1046		break;
1047	case TGSI_OPCODE_LRP:
1048		for (c = 0; c < 4; c++) {
1049			if (!(mask & (1 << c)))
1050				continue;
1051			/*XXX: we can do better than this */
1052			temp = alloc_temp(pc, NULL);
1053			emit_neg(pc, temp, src[0][c]);
1054			emit_mad(pc, temp, temp, src[2][c], src[2][c]);
1055			emit_mad(pc, dst[c], src[0][c], src[1][c], temp);
1056			free_temp(pc, temp);
1057		}
1058		break;
1059	case TGSI_OPCODE_MAD:
1060		for (c = 0; c < 4; c++) {
1061			if (!(mask & (1 << c)))
1062				continue;
1063			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1064		}
1065		break;
1066	case TGSI_OPCODE_MAX:
1067		for (c = 0; c < 4; c++) {
1068			if (!(mask & (1 << c)))
1069				continue;
1070			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1071		}
1072		break;
1073	case TGSI_OPCODE_MIN:
1074		for (c = 0; c < 4; c++) {
1075			if (!(mask & (1 << c)))
1076				continue;
1077			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1078		}
1079		break;
1080	case TGSI_OPCODE_MOV:
1081		for (c = 0; c < 4; c++) {
1082			if (!(mask & (1 << c)))
1083				continue;
1084			emit_mov(pc, dst[c], src[0][c]);
1085		}
1086		break;
1087	case TGSI_OPCODE_MUL:
1088		for (c = 0; c < 4; c++) {
1089			if (!(mask & (1 << c)))
1090				continue;
1091			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1092		}
1093		break;
1094	case TGSI_OPCODE_POW:
1095		temp = alloc_temp(pc, NULL);
1096		emit_pow(pc, temp, src[0][0], src[1][0]);
1097		for (c = 0; c < 4; c++) {
1098			if (!(mask & (1 << c)))
1099				continue;
1100			emit_mov(pc, dst[c], temp);
1101		}
1102		free_temp(pc, temp);
1103		break;
1104	case TGSI_OPCODE_RCP:
1105		for (c = 0; c < 4; c++) {
1106			if (!(mask & (1 << c)))
1107				continue;
1108			emit_flop(pc, 0, dst[c], src[0][0]);
1109		}
1110		break;
1111	case TGSI_OPCODE_RSQ:
1112		for (c = 0; c < 4; c++) {
1113			if (!(mask & (1 << c)))
1114				continue;
1115			emit_flop(pc, 2, dst[c], src[0][0]);
1116		}
1117		break;
1118	case TGSI_OPCODE_SCS:
1119		temp = alloc_temp(pc, NULL);
1120		emit_precossin(pc, temp, src[0][0]);
1121		if (mask & (1 << 0))
1122			emit_flop(pc, 5, dst[0], temp);
1123		if (mask & (1 << 1))
1124			emit_flop(pc, 4, dst[1], temp);
1125		break;
1126	case TGSI_OPCODE_SGE:
1127		for (c = 0; c < 4; c++) {
1128			if (!(mask & (1 << c)))
1129				continue;
1130			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1131		}
1132		break;
1133	case TGSI_OPCODE_SIN:
1134		temp = alloc_temp(pc, NULL);
1135		emit_precossin(pc, temp, src[0][0]);
1136		emit_flop(pc, 4, temp, temp);
1137		for (c = 0; c < 4; c++) {
1138			if (!(mask & (1 << c)))
1139				continue;
1140			emit_mov(pc, dst[c], temp);
1141		}
1142		break;
1143	case TGSI_OPCODE_SLT:
1144		for (c = 0; c < 4; c++) {
1145			if (!(mask & (1 << c)))
1146				continue;
1147			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1148		}
1149		break;
1150	case TGSI_OPCODE_SUB:
1151		for (c = 0; c < 4; c++) {
1152			if (!(mask & (1 << c)))
1153				continue;
1154			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1155		}
1156		break;
1157	case TGSI_OPCODE_TEX:
1158		{
1159			struct nv50_reg *t0, *t1, *t2, *t3;
1160			struct nv50_program_exec *e;
1161
1162			t0 = alloc_temp(pc, NULL);
1163			t0 = alloc_temp(pc, NULL);
1164			t1 = alloc_temp(pc, NULL);
1165			t2 = alloc_temp(pc, NULL);
1166			t3 = alloc_temp(pc, NULL);
1167			emit_mov(pc, t0, src[0][0]);
1168			emit_mov(pc, t1, src[0][1]);
1169
1170			e = exec(pc);
1171			e->inst[0] = 0xf6400000;
1172			set_long(pc, e);
1173			e->inst[1] |= 0x0000c004;
1174			set_dst(pc, t0, e);
1175			emit(pc, e);
1176
1177			if (mask & (1 << 0)) emit_mov(pc, dst[0], t0);
1178			if (mask & (1 << 1)) emit_mov(pc, dst[1], t1);
1179			if (mask & (1 << 2)) emit_mov(pc, dst[2], t2);
1180			if (mask & (1 << 3)) emit_mov(pc, dst[3], t3);
1181
1182			free_temp(pc, t0);
1183			free_temp(pc, t1);
1184			free_temp(pc, t2);
1185			free_temp(pc, t3);
1186		}
1187		break;
1188	case TGSI_OPCODE_XPD:
1189		temp = alloc_temp(pc, NULL);
1190		if (mask & (1 << 0)) {
1191			emit_mul(pc, temp, src[0][2], src[1][1]);
1192			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1193		}
1194		if (mask & (1 << 1)) {
1195			emit_mul(pc, temp, src[0][0], src[1][2]);
1196			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1197		}
1198		if (mask & (1 << 2)) {
1199			emit_mul(pc, temp, src[0][1], src[1][0]);
1200			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1201		}
1202		free_temp(pc, temp);
1203		break;
1204	case TGSI_OPCODE_END:
1205		break;
1206	default:
1207		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1208		return FALSE;
1209	}
1210
1211	if (sat) {
1212		for (c = 0; c < 4; c++) {
1213			struct nv50_program_exec *e;
1214
1215			if (!(mask & (1 << c)))
1216				continue;
1217			e = exec(pc);
1218
1219			e->inst[0] = 0xa0000000; /* cvt */
1220			set_long(pc, e);
1221			e->inst[1] |= (6 << 29); /* cvt */
1222			e->inst[1] |= 0x04000000; /* 32 bit */
1223			e->inst[1] |= (1 << 14); /* src .f32 */
1224			e->inst[1] |= ((1 << 5) << 14); /* .sat */
1225			set_dst(pc, rdst[c], e);
1226			set_src_0(pc, dst[c], e);
1227			emit(pc, e);
1228		}
1229	}
1230
1231	kill_temp_temp(pc);
1232	return TRUE;
1233}
1234
1235static boolean
1236nv50_program_tx_prep(struct nv50_pc *pc)
1237{
1238	struct tgsi_parse_context p;
1239	boolean ret = FALSE;
1240	unsigned i, c;
1241
1242	tgsi_parse_init(&p, pc->p->pipe.tokens);
1243	while (!tgsi_parse_end_of_tokens(&p)) {
1244		const union tgsi_full_token *tok = &p.FullToken;
1245
1246		tgsi_parse_token(&p);
1247		switch (tok->Token.Type) {
1248		case TGSI_TOKEN_TYPE_IMMEDIATE:
1249		{
1250			const struct tgsi_full_immediate *imm =
1251				&p.FullToken.FullImmediate;
1252
1253			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1254				      imm->u.ImmediateFloat32[1].Float,
1255				      imm->u.ImmediateFloat32[2].Float,
1256				      imm->u.ImmediateFloat32[3].Float);
1257		}
1258			break;
1259		case TGSI_TOKEN_TYPE_DECLARATION:
1260		{
1261			const struct tgsi_full_declaration *d;
1262			unsigned last;
1263
1264			d = &p.FullToken.FullDeclaration;
1265			last = d->DeclarationRange.Last;
1266
1267			switch (d->Declaration.File) {
1268			case TGSI_FILE_TEMPORARY:
1269				if (pc->temp_nr < (last + 1))
1270					pc->temp_nr = last + 1;
1271				break;
1272			case TGSI_FILE_OUTPUT:
1273				if (pc->result_nr < (last + 1))
1274					pc->result_nr = last + 1;
1275				break;
1276			case TGSI_FILE_INPUT:
1277				if (pc->attr_nr < (last + 1))
1278					pc->attr_nr = last + 1;
1279				break;
1280			case TGSI_FILE_CONSTANT:
1281				if (pc->param_nr < (last + 1))
1282					pc->param_nr = last + 1;
1283				break;
1284			case TGSI_FILE_SAMPLER:
1285				break;
1286			default:
1287				NOUVEAU_ERR("bad decl file %d\n",
1288					    d->Declaration.File);
1289				goto out_err;
1290			}
1291		}
1292			break;
1293		case TGSI_TOKEN_TYPE_INSTRUCTION:
1294			break;
1295		default:
1296			break;
1297		}
1298	}
1299
1300	if (pc->temp_nr) {
1301		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
1302		if (!pc->temp)
1303			goto out_err;
1304
1305		for (i = 0; i < pc->temp_nr; i++) {
1306			for (c = 0; c < 4; c++) {
1307				pc->temp[i*4+c].type = P_TEMP;
1308				pc->temp[i*4+c].hw = -1;
1309				pc->temp[i*4+c].index = i;
1310			}
1311		}
1312	}
1313
1314	if (pc->attr_nr) {
1315		struct nv50_reg *iv = NULL;
1316		int aid = 0;
1317
1318		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
1319		if (!pc->attr)
1320			goto out_err;
1321
1322		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1323			iv = alloc_temp(pc, NULL);
1324			emit_interp(pc, iv, iv, NULL);
1325			emit_flop(pc, 0, iv, iv);
1326			aid++;
1327		}
1328
1329		for (i = 0; i < pc->attr_nr; i++) {
1330			struct nv50_reg *a = &pc->attr[i*4];
1331
1332			for (c = 0; c < 4; c++) {
1333				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1334					struct nv50_reg *at =
1335						alloc_temp(pc, NULL);
1336					pc->attr[i*4+c].type = at->type;
1337					pc->attr[i*4+c].hw = at->hw;
1338					pc->attr[i*4+c].index = at->index;
1339				} else {
1340					pc->p->cfg.vp.attr[aid/32] |=
1341						(1 << (aid % 32));
1342					pc->attr[i*4+c].type = P_ATTR;
1343					pc->attr[i*4+c].hw = aid++;
1344					pc->attr[i*4+c].index = i;
1345				}
1346			}
1347
1348			if (pc->p->type != PIPE_SHADER_FRAGMENT)
1349				continue;
1350
1351			emit_interp(pc, &a[0], &a[0], iv);
1352			emit_interp(pc, &a[1], &a[1], iv);
1353			emit_interp(pc, &a[2], &a[2], iv);
1354			emit_interp(pc, &a[3], &a[3], iv);
1355		}
1356
1357		if (iv)
1358			free_temp(pc, iv);
1359	}
1360
1361	if (pc->result_nr) {
1362		int rid = 0;
1363
1364		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
1365		if (!pc->result)
1366			goto out_err;
1367
1368		for (i = 0; i < pc->result_nr; i++) {
1369			for (c = 0; c < 4; c++) {
1370				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1371					pc->result[i*4+c].type = P_TEMP;
1372					pc->result[i*4+c].hw = -1;
1373				} else {
1374					pc->result[i*4+c].type = P_RESULT;
1375					pc->result[i*4+c].hw = rid++;
1376				}
1377				pc->result[i*4+c].index = i;
1378			}
1379		}
1380	}
1381
1382	if (pc->param_nr) {
1383		int rid = 0;
1384
1385		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
1386		if (!pc->param)
1387			goto out_err;
1388
1389		for (i = 0; i < pc->param_nr; i++) {
1390			for (c = 0; c < 4; c++) {
1391				pc->param[i*4+c].type = P_CONST;
1392				pc->param[i*4+c].hw = rid++;
1393				pc->param[i*4+c].index = i;
1394			}
1395		}
1396	}
1397
1398	if (pc->immd_nr) {
1399		int rid = pc->param_nr * 4;
1400
1401		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
1402		if (!pc->immd)
1403			goto out_err;
1404
1405		for (i = 0; i < pc->immd_nr; i++) {
1406			for (c = 0; c < 4; c++) {
1407				pc->immd[i*4+c].type = P_IMMD;
1408				pc->immd[i*4+c].hw = rid++;
1409				pc->immd[i*4+c].index = i;
1410			}
1411		}
1412	}
1413
1414	ret = TRUE;
1415out_err:
1416	tgsi_parse_free(&p);
1417	return ret;
1418}
1419
1420static boolean
1421nv50_program_tx(struct nv50_program *p)
1422{
1423	struct tgsi_parse_context parse;
1424	struct nv50_pc *pc;
1425	boolean ret;
1426
1427	pc = CALLOC_STRUCT(nv50_pc);
1428	if (!pc)
1429		return FALSE;
1430	pc->p = p;
1431	pc->p->cfg.high_temp = 4;
1432
1433	ret = nv50_program_tx_prep(pc);
1434	if (ret == FALSE)
1435		goto out_cleanup;
1436
1437	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1438	while (!tgsi_parse_end_of_tokens(&parse)) {
1439		const union tgsi_full_token *tok = &parse.FullToken;
1440
1441		tgsi_parse_token(&parse);
1442
1443		switch (tok->Token.Type) {
1444		case TGSI_TOKEN_TYPE_INSTRUCTION:
1445			ret = nv50_program_tx_insn(pc, tok);
1446			if (ret == FALSE)
1447				goto out_err;
1448			break;
1449		default:
1450			break;
1451		}
1452	}
1453
1454	if (p->type == PIPE_SHADER_FRAGMENT) {
1455		struct nv50_reg out;
1456
1457		out.type = P_TEMP;
1458		for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++)
1459			emit_mov(pc, &out, &pc->result[out.hw]);
1460	}
1461
1462	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
1463	pc->p->exec_tail->inst[1] |= 0x00000001;
1464
1465	p->param_nr = pc->param_nr * 4;
1466	p->immd_nr = pc->immd_nr * 4;
1467	p->immd = pc->immd_buf;
1468
1469out_err:
1470	tgsi_parse_free(&parse);
1471
1472out_cleanup:
1473	return ret;
1474}
1475
1476static void
1477nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1478{
1479	if (nv50_program_tx(p) == FALSE)
1480		assert(0);
1481	p->translated = TRUE;
1482}
1483
1484static void
1485nv50_program_upload_data(struct nv50_context *nv50, float *map,
1486			 unsigned start, unsigned count)
1487{
1488	while (count) {
1489		unsigned nr = count > 2047 ? 2047 : count;
1490
1491		BEGIN_RING(tesla, 0x00000f00, 1);
1492		OUT_RING  ((NV50_CB_PMISC << 0) | (start << 8));
1493		BEGIN_RING(tesla, 0x40000f04, nr);
1494		OUT_RINGp (map, nr);
1495
1496		map += nr;
1497		start += nr;
1498		count -= nr;
1499	}
1500}
1501
1502static void
1503nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1504{
1505	struct nouveau_winsys *nvws = nv50->screen->nvws;
1506	struct pipe_winsys *ws = nv50->pipe.winsys;
1507	unsigned nr = p->param_nr + p->immd_nr;
1508
1509	if (!p->data && nr) {
1510		struct nouveau_resource *heap = nv50->screen->vp_data_heap;
1511
1512		if (nvws->res_alloc(heap, nr, p, &p->data)) {
1513			while (heap->next && heap->size < nr) {
1514				struct nv50_program *evict = heap->next->priv;
1515				nvws->res_free(&evict->data);
1516			}
1517
1518			if (nvws->res_alloc(heap, nr, p, &p->data))
1519				assert(0);
1520		}
1521	}
1522
1523	if (p->param_nr) {
1524		float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
1525					    PIPE_BUFFER_USAGE_CPU_READ);
1526		nv50_program_upload_data(nv50, map, p->data->start,
1527					 p->param_nr);
1528		ws->buffer_unmap(ws, nv50->constbuf[p->type]);
1529	}
1530
1531	if (p->immd_nr) {
1532		nv50_program_upload_data(nv50, p->immd,
1533					 p->data->start + p->param_nr,
1534					 p->immd_nr);
1535	}
1536}
1537
1538static void
1539nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1540{
1541	struct pipe_winsys *ws = nv50->pipe.winsys;
1542	struct nv50_program_exec *e;
1543	struct nouveau_stateobj *so;
1544	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
1545	unsigned start, count, *up, *ptr;
1546	boolean upload = FALSE;
1547
1548	if (!p->buffer) {
1549		p->buffer = ws->buffer_create(ws, 0x100, 0, p->exec_size * 4);
1550		upload = TRUE;
1551	}
1552
1553	if (p->data && p->data->start != p->data_start) {
1554		for (e = p->exec_head; e; e = e->next) {
1555			unsigned ei, ci;
1556
1557			if (e->param.index < 0)
1558				continue;
1559			ei = e->param.shift >> 5;
1560			ci = e->param.index + p->data->start;
1561
1562			e->inst[ei] &= ~e->param.mask;
1563			e->inst[ei] |= (ci << e->param.shift);
1564		}
1565
1566		p->data_start = p->data->start;
1567		upload = TRUE;
1568	}
1569
1570	if (!upload)
1571		return;
1572
1573	up = ptr = MALLOC(p->exec_size * 4);
1574	for (e = p->exec_head; e; e = e->next) {
1575		*(ptr++) = e->inst[0];
1576		if (is_long(e))
1577			*(ptr++) = e->inst[1];
1578	}
1579
1580	so = so_new(4,2);
1581	so_method(so, nv50->screen->tesla, 0x1280, 3);
1582	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
1583	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
1584	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
1585
1586	start = 0; count = p->exec_size;
1587	while (count) {
1588		struct nouveau_winsys *nvws = nv50->screen->nvws;
1589		unsigned nr;
1590
1591		so_emit(nvws, so);
1592
1593		nr = MIN2(count, 2047);
1594		nr = MIN2(nvws->channel->pushbuf->remaining, nr);
1595		if (nvws->channel->pushbuf->remaining < (nr + 3)) {
1596			FIRE_RING(NULL);
1597			continue;
1598		}
1599
1600		BEGIN_RING(tesla, 0x0f00, 1);
1601		OUT_RING  ((start << 8) | NV50_CB_PUPLOAD);
1602		BEGIN_RING(tesla, 0x40000f04, nr);
1603		OUT_RINGp (up + start, nr);
1604
1605		start += nr;
1606		count -= nr;
1607	}
1608
1609	FREE(up);
1610	so_ref(NULL, &so);
1611}
1612
1613void
1614nv50_vertprog_validate(struct nv50_context *nv50)
1615{
1616	struct nouveau_grobj *tesla = nv50->screen->tesla;
1617	struct nv50_program *p = nv50->vertprog;
1618	struct nouveau_stateobj *so;
1619
1620	if (!p->translated) {
1621		nv50_program_validate(nv50, p);
1622		if (!p->translated)
1623			assert(0);
1624	}
1625
1626	nv50_program_validate_data(nv50, p);
1627	nv50_program_validate_code(nv50, p);
1628
1629	so = so_new(13, 2);
1630	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
1631	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1632		  NOUVEAU_BO_HIGH, 0, 0);
1633	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1634		  NOUVEAU_BO_LOW, 0, 0);
1635	so_method(so, tesla, 0x1650, 2);
1636	so_data  (so, p->cfg.vp.attr[0]);
1637	so_data  (so, p->cfg.vp.attr[1]);
1638	so_method(so, tesla, 0x16b8, 1);
1639	so_data  (so, p->cfg.high_result);
1640	so_method(so, tesla, 0x16ac, 2);
1641	so_data  (so, p->cfg.high_result); //8);
1642	so_data  (so, p->cfg.high_temp);
1643	so_method(so, tesla, 0x140c, 1);
1644	so_data  (so, 0); /* program start offset */
1645	so_ref(so, &nv50->state.vertprog);
1646}
1647
1648void
1649nv50_fragprog_validate(struct nv50_context *nv50)
1650{
1651	struct nouveau_grobj *tesla = nv50->screen->tesla;
1652	struct nv50_program *p = nv50->fragprog;
1653	struct nouveau_stateobj *so;
1654
1655	if (!p->translated) {
1656		nv50_program_validate(nv50, p);
1657		if (!p->translated)
1658			assert(0);
1659	}
1660
1661	nv50_program_validate_data(nv50, p);
1662	nv50_program_validate_code(nv50, p);
1663
1664	so = so_new(64, 2);
1665	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
1666	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1667		  NOUVEAU_BO_HIGH, 0, 0);
1668	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1669		  NOUVEAU_BO_LOW, 0, 0);
1670	so_method(so, tesla, 0x1904, 4);
1671	so_data  (so, 0x01040404); /* p: 0x01000404 */
1672	so_data  (so, 0x00000004);
1673	so_data  (so, 0x00000000);
1674	so_data  (so, 0x00000000);
1675	so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */
1676	so_data  (so, 0x03020100);
1677	so_data  (so, 0x07060504);
1678	so_data  (so, 0x0b0a0908);
1679	so_method(so, tesla, 0x1988, 2);
1680	so_data  (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */
1681	so_data  (so, p->cfg.high_temp);
1682	so_method(so, tesla, 0x1414, 1);
1683	so_data  (so, 0); /* program start offset */
1684	so_ref(so, &nv50->state.fragprog);
1685}
1686
1687void
1688nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
1689{
1690	struct pipe_winsys *ws = nv50->pipe.winsys;
1691
1692	while (p->exec_head) {
1693		struct nv50_program_exec *e = p->exec_head;
1694
1695		p->exec_head = e->next;
1696		FREE(e);
1697	}
1698	p->exec_tail = NULL;
1699	p->exec_size = 0;
1700
1701	if (p->buffer)
1702		pipe_buffer_reference(ws, &p->buffer, NULL);
1703
1704	nv50->screen->nvws->res_free(&p->data);
1705
1706	p->translated = 0;
1707}
1708
1709