nv50_program.c revision f579a99cc608eaba6f617c11ab0aec7f3e9ef953
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
/* Number of temporary-register slots tracked by the allocator: this is the
 * size of nv50_pc::r_temp and the upper bound of the slot search loops in
 * alloc_reg() and alloc_temp().
 */
34#define NV50_SU_MAX_TEMP 64
35//#define NV50_PROGRAM_DUMP
36
37/* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * 	- Fuck it off, introduce a way to negate args for ops that
41 * 	  support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
53 * FUCK! watch dst==src vectors, can overwrite components that are needed.
54 * 	ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * 	FP attr/result assignment - how?
58 * 		attrib
59 * 			- 0x16bc maps vp output onto fp hpos
60 * 			- 0x16c0 maps vp output onto fp col0
61 * 		result
62 * 			- colr always 0-3
63 * 			- depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * 	      "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * 	- 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * 	- XX == FP high something
75 */
/*
 * One scalar register operand as tracked by the translator.  TGSI vec4
 * registers are addressed as four consecutive nv50_reg entries
 * (Index * 4 + channel, see tgsi_dst()/tgsi_src()).
 */
76struct nv50_reg {
77	enum {
78		P_TEMP,		/* only type that alloc_reg() assigns slots for */
79		P_ATTR,
80		P_RESULT,
81		P_CONST,
82		P_IMMD		/* hw is an offset into nv50_pc::immd_buf */
83	} type;
	/* -1 marks an anonymous register created by alloc_temp(),
	 * alloc_temp4() or alloc_immd(); free_temp() only releases
	 * registers whose index is -1. */
84	int index;
85
	/* Assigned hardware slot.  For a P_TEMP, a negative value means
	 * "not placed yet" and alloc_reg() will pick a slot. */
86	int hw;
	/* NOTE(review): presumably a negate flag - not referenced by the
	 * code visible here; confirm against the rest of the file. */
87	int neg;
88
89	int rhw; /* result hw for FP outputs, or interpolant index */
90	int acc; /* instruction where this reg is last read (first insn == 1) */
91};
92
/*
 * Per-program translation context: the program being built, the temp-slot
 * occupancy table, the per-file register arrays and the immediate pool.
 */
93struct nv50_pc {
94	struct nv50_program *p;
95
96	/* hw resources */
	/* r_temp[i] is non-NULL while temp slot i is owned by that register */
97	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
98
99	/* tgsi resources */
100	struct nv50_reg *temp;
101	int temp_nr;
102	struct nv50_reg *attr;
103	int attr_nr;
104	struct nv50_reg *result;
105	int result_nr;
106	struct nv50_reg *param;
107	int param_nr;
108	struct nv50_reg *immd;
	/* immediate pool: immd_nr vec4 entries, i.e. immd_nr * 4 floats
	 * (grown one vec4 at a time by ctor_immd()) */
109	float *immd_buf;
110	int immd_nr;
111
	/* scratch temps handed out by temp_temp() and released in bulk by
	 * kill_temp_temp(); temp_temp_nr counts the entries in use */
112	struct nv50_reg *temp_temp[16];
113	unsigned temp_temp_nr;
114
115	unsigned interp_mode[32];
116	/* perspective interpolation registers */
117	struct nv50_reg *iv_p;
118	struct nv50_reg *iv_c;
119
120	/* current instruction and total number of insns */
121	unsigned insn_cur;
122	unsigned insn_nr;
123};
124
125static void
126alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
127{
128	int i = 0;
129
130	if (reg->type == P_RESULT) {
131		if (pc->p->cfg.high_result < (reg->hw + 1))
132			pc->p->cfg.high_result = reg->hw + 1;
133	}
134
135	if (reg->type != P_TEMP)
136		return;
137
138	if (reg->hw >= 0) {
139		/*XXX: do this here too to catch FP temp-as-attr usage..
140		 *     not clean, but works */
141		if (pc->p->cfg.high_temp < (reg->hw + 1))
142			pc->p->cfg.high_temp = reg->hw + 1;
143		return;
144	}
145
146	if (reg->rhw != -1) {
147		/* try to allocate temporary with index rhw first */
148		if (!(pc->r_temp[reg->rhw])) {
149			pc->r_temp[reg->rhw] = reg;
150			reg->hw = reg->rhw;
151			if (pc->p->cfg.high_temp < (reg->rhw + 1))
152				pc->p->cfg.high_temp = reg->rhw + 1;
153			return;
154		}
155		/* make sure we don't get things like $r0 needs to go
156		 * in $r1 and $r1 in $r0
157		 */
158		i = pc->result_nr * 4;
159	}
160
161	for (; i < NV50_SU_MAX_TEMP; i++) {
162		if (!(pc->r_temp[i])) {
163			pc->r_temp[i] = reg;
164			reg->hw = i;
165			if (pc->p->cfg.high_temp < (i + 1))
166				pc->p->cfg.high_temp = i + 1;
167			return;
168		}
169	}
170
171	assert(0);
172}
173
174static struct nv50_reg *
175alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
176{
177	struct nv50_reg *r;
178	int i;
179
180	if (dst && dst->type == P_TEMP && dst->hw == -1)
181		return dst;
182
183	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
184		if (!pc->r_temp[i]) {
185			r = CALLOC_STRUCT(nv50_reg);
186			r->type = P_TEMP;
187			r->index = -1;
188			r->hw = i;
189			r->rhw = -1;
190			pc->r_temp[i] = r;
191			return r;
192		}
193	}
194
195	assert(0);
196	return NULL;
197}
198
199static void
200free_temp(struct nv50_pc *pc, struct nv50_reg *r)
201{
202	if (r->index == -1) {
203		unsigned hw = r->hw;
204
205		FREE(pc->r_temp[hw]);
206		pc->r_temp[hw] = NULL;
207	}
208}
209
210static int
211alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
212{
213	int i;
214
215	if ((idx + 4) >= NV50_SU_MAX_TEMP)
216		return 1;
217
218	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
219	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
220		return alloc_temp4(pc, dst, idx + 1);
221
222	for (i = 0; i < 4; i++) {
223		dst[i] = CALLOC_STRUCT(nv50_reg);
224		dst[i]->type = P_TEMP;
225		dst[i]->index = -1;
226		dst[i]->hw = idx + i;
227		pc->r_temp[idx + i] = dst[i];
228	}
229
230	return 0;
231}
232
/* Release the four registers of a group, in order, through free_temp(). */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	int n = 0;

	while (n < 4) {
		free_temp(pc, reg[n]);
		n++;
	}
}
241
242static struct nv50_reg *
243temp_temp(struct nv50_pc *pc)
244{
245	if (pc->temp_temp_nr >= 16)
246		assert(0);
247
248	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
249	return pc->temp_temp[pc->temp_temp_nr++];
250}
251
252static void
253kill_temp_temp(struct nv50_pc *pc)
254{
255	int i;
256
257	for (i = 0; i < pc->temp_temp_nr; i++)
258		free_temp(pc, pc->temp_temp[i]);
259	pc->temp_temp_nr = 0;
260}
261
262static int
263ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
264{
265	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)),
266			       (pc->immd_nr + 1) * 4 * sizeof(float));
267	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
268	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
269	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
270	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
271
272	return pc->immd_nr++;
273}
274
275static struct nv50_reg *
276alloc_immd(struct nv50_pc *pc, float f)
277{
278	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
279	unsigned hw;
280
281	for (hw = 0; hw < pc->immd_nr * 4; hw++)
282		if (pc->immd_buf[hw] == f)
283			break;
284
285	if (hw == pc->immd_nr * 4)
286		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
287
288	r->type = P_IMMD;
289	r->hw = hw;
290	r->index = -1;
291	return r;
292}
293
294static struct nv50_program_exec *
295exec(struct nv50_pc *pc)
296{
297	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
298
299	e->param.index = -1;
300	return e;
301}
302
303static void
304emit(struct nv50_pc *pc, struct nv50_program_exec *e)
305{
306	struct nv50_program *p = pc->p;
307
308	if (p->exec_tail)
309		p->exec_tail->next = e;
310	if (!p->exec_head)
311		p->exec_head = e;
312	p->exec_tail = e;
313	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
314}
315
/* Forward declaration: set_pred() and set_pred_wr() below call set_long(),
 * which in turn is defined after them because it calls both. */
316static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
317
318static boolean
319is_long(struct nv50_program_exec *e)
320{
321	if (e->inst[0] & 1)
322		return TRUE;
323	return FALSE;
324}
325
326static boolean
327is_immd(struct nv50_program_exec *e)
328{
329	if (is_long(e) && (e->inst[1] & 3) == 3)
330		return TRUE;
331	return FALSE;
332}
333
334static INLINE void
335set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
336	 struct nv50_program_exec *e)
337{
338	set_long(pc, e);
339	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
340	e->inst[1] |= (pred << 7) | (idx << 12);
341}
342
343static INLINE void
344set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
345	    struct nv50_program_exec *e)
346{
347	set_long(pc, e);
348	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
349	e->inst[1] |= (idx << 4) | (on << 6);
350}
351
352static INLINE void
353set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
354{
355	if (is_long(e))
356		return;
357
358	e->inst[0] |= 1;
359	set_pred(pc, 0xf, 0, e);
360	set_pred_wr(pc, 0, 0, e);
361}
362
363static INLINE void
364set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
365{
366	if (dst->type == P_RESULT) {
367		set_long(pc, e);
368		e->inst[1] |= 0x00000008;
369	}
370
371	alloc_reg(pc, dst);
372	e->inst[0] |= (dst->hw << 2);
373}
374
375static INLINE void
376set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
377{
378	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
379
380	set_long(pc, e);
381	/*XXX: can't be predicated - bits overlap.. catch cases where both
382	 *     are required and avoid them. */
383	set_pred(pc, 0, 0, e);
384	set_pred_wr(pc, 0, 0, e);
385
386	e->inst[1] |= 0x00000002 | 0x00000001;
387	e->inst[0] |= (val & 0x3f) << 16;
388	e->inst[1] |= (val >> 6) << 2;
389}
390
391
/* Interpolation mode flags for emit_interp(), tested there with '&'.
 * INTERP_LINEAR is 0, i.e. the absence of every other flag; INTERP_FLAT
 * takes precedence over PERSPECTIVE and CENTROID in emit_interp().
 */
392#define INTERP_LINEAR		0
393#define INTERP_FLAT			1
394#define INTERP_PERSPECTIVE	2
395#define INTERP_CENTROID		4
396
397/* interpolant index has been stored in dst->rhw */
398static void
399emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
400		unsigned mode)
401{
402	assert(dst->rhw != -1);
403	struct nv50_program_exec *e = exec(pc);
404
405	e->inst[0] |= 0x80000000;
406	set_dst(pc, dst, e);
407	e->inst[0] |= (dst->rhw << 16);
408
409	if (mode & INTERP_FLAT) {
410		e->inst[0] |= (1 << 8);
411	} else {
412		if (mode & INTERP_PERSPECTIVE) {
413			e->inst[0] |= (1 << 25);
414			alloc_reg(pc, iv);
415			e->inst[0] |= (iv->hw << 9);
416		}
417
418		if (mode & INTERP_CENTROID)
419			e->inst[0] |= (1 << 24);
420	}
421
422	emit(pc, e);
423}
424
425static void
426set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
427	 struct nv50_program_exec *e)
428{
429	set_long(pc, e);
430#if 1
431	e->inst[1] |= (1 << 22);
432#else
433	if (src->type == P_IMMD) {
434		e->inst[1] |= (NV50_CB_PMISC << 22);
435	} else {
436		if (pc->p->type == PIPE_SHADER_VERTEX)
437			e->inst[1] |= (NV50_CB_PVP << 22);
438		else
439			e->inst[1] |= (NV50_CB_PFP << 22);
440	}
441#endif
442
443	e->param.index = src->hw;
444	e->param.shift = s;
445	e->param.mask = m << (s % 32);
446}
447
448static void
449emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
450{
451	struct nv50_program_exec *e = exec(pc);
452
453	e->inst[0] |= 0x10000000;
454
455	set_dst(pc, dst, e);
456
457	if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
458		set_immd(pc, src, e);
459		/*XXX: 32-bit, but steals part of "half" reg space - need to
460		 *     catch and handle this case if/when we do half-regs
461		 */
462		e->inst[0] |= 0x00008000;
463	} else
464	if (src->type == P_IMMD || src->type == P_CONST) {
465		set_long(pc, e);
466		set_data(pc, src, 0x7f, 9, e);
467		e->inst[1] |= 0x20000000; /* src0 const? */
468	} else {
469		if (src->type == P_ATTR) {
470			set_long(pc, e);
471			e->inst[1] |= 0x00200000;
472		}
473
474		alloc_reg(pc, src);
475		e->inst[0] |= (src->hw << 9);
476	}
477
478	/* We really should support "half" instructions here at some point,
479	 * but I don't feel confident enough about them yet.
480	 */
481	set_long(pc, e);
482	if (is_long(e) && !is_immd(e)) {
483		e->inst[1] |= 0x04000000; /* 32-bit */
484		e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
485	}
486
487	emit(pc, e);
488}
489
490static INLINE void
491emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
492{
493	struct nv50_reg *imm = alloc_immd(pc, f);
494	emit_mov(pc, dst, imm);
495	FREE(imm);
496}
497
498static boolean
499check_swap_src_0_1(struct nv50_pc *pc,
500		   struct nv50_reg **s0, struct nv50_reg **s1)
501{
502	struct nv50_reg *src0 = *s0, *src1 = *s1;
503
504	if (src0->type == P_CONST) {
505		if (src1->type != P_CONST) {
506			*s0 = src1;
507			*s1 = src0;
508			return TRUE;
509		}
510	} else
511	if (src1->type == P_ATTR) {
512		if (src0->type != P_ATTR) {
513			*s0 = src1;
514			*s1 = src0;
515			return TRUE;
516		}
517	}
518
519	return FALSE;
520}
521
522static void
523set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
524{
525	if (src->type == P_ATTR) {
526		set_long(pc, e);
527		e->inst[1] |= 0x00200000;
528	} else
529	if (src->type == P_CONST || src->type == P_IMMD) {
530		struct nv50_reg *temp = temp_temp(pc);
531
532		emit_mov(pc, temp, src);
533		src = temp;
534	}
535
536	alloc_reg(pc, src);
537	e->inst[0] |= (src->hw << 9);
538}
539
540static void
541set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
542{
543	if (src->type == P_ATTR) {
544		struct nv50_reg *temp = temp_temp(pc);
545
546		emit_mov(pc, temp, src);
547		src = temp;
548	} else
549	if (src->type == P_CONST || src->type == P_IMMD) {
550		assert(!(e->inst[0] & 0x00800000));
551		if (e->inst[0] & 0x01000000) {
552			struct nv50_reg *temp = temp_temp(pc);
553
554			emit_mov(pc, temp, src);
555			src = temp;
556		} else {
557			set_data(pc, src, 0x7f, 16, e);
558			e->inst[0] |= 0x00800000;
559		}
560	}
561
562	alloc_reg(pc, src);
563	e->inst[0] |= (src->hw << 16);
564}
565
566static void
567set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
568{
569	set_long(pc, e);
570
571	if (src->type == P_ATTR) {
572		struct nv50_reg *temp = temp_temp(pc);
573
574		emit_mov(pc, temp, src);
575		src = temp;
576	} else
577	if (src->type == P_CONST || src->type == P_IMMD) {
578		assert(!(e->inst[0] & 0x01000000));
579		if (e->inst[0] & 0x00800000) {
580			struct nv50_reg *temp = temp_temp(pc);
581
582			emit_mov(pc, temp, src);
583			src = temp;
584		} else {
585			set_data(pc, src, 0x7f, 32+14, e);
586			e->inst[0] |= 0x01000000;
587		}
588	}
589
590	alloc_reg(pc, src);
591	e->inst[1] |= (src->hw << 14);
592}
593
594static void
595emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
596	 struct nv50_reg *src1)
597{
598	struct nv50_program_exec *e = exec(pc);
599
600	e->inst[0] |= 0xc0000000;
601	set_long(pc, e);
602
603	check_swap_src_0_1(pc, &src0, &src1);
604	set_dst(pc, dst, e);
605	set_src_0(pc, src0, e);
606	set_src_1(pc, src1, e);
607
608	emit(pc, e);
609}
610
611static void
612emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
613	 struct nv50_reg *src0, struct nv50_reg *src1)
614{
615	struct nv50_program_exec *e = exec(pc);
616
617	e->inst[0] |= 0xb0000000;
618
619	check_swap_src_0_1(pc, &src0, &src1);
620	set_dst(pc, dst, e);
621	set_src_0(pc, src0, e);
622	if (is_long(e))
623		set_src_2(pc, src1, e);
624	else
625		set_src_1(pc, src1, e);
626
627	emit(pc, e);
628}
629
630static void
631emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
632	    struct nv50_reg *src0, struct nv50_reg *src1)
633{
634	struct nv50_program_exec *e = exec(pc);
635
636	set_long(pc, e);
637	e->inst[0] |= 0xb0000000;
638	e->inst[1] |= (sub << 29);
639
640	check_swap_src_0_1(pc, &src0, &src1);
641	set_dst(pc, dst, e);
642	set_src_0(pc, src0, e);
643	set_src_1(pc, src1, e);
644
645	emit(pc, e);
646}
647
648static void
649emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
650	 struct nv50_reg *src1)
651{
652	struct nv50_program_exec *e = exec(pc);
653
654	e->inst[0] |= 0xb0000000;
655
656	set_long(pc, e);
657	if (check_swap_src_0_1(pc, &src0, &src1))
658		e->inst[1] |= 0x04000000;
659	else
660		e->inst[1] |= 0x08000000;
661
662	set_dst(pc, dst, e);
663	set_src_0(pc, src0, e);
664	set_src_2(pc, src1, e);
665
666	emit(pc, e);
667}
668
669static void
670emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
671	 struct nv50_reg *src1, struct nv50_reg *src2)
672{
673	struct nv50_program_exec *e = exec(pc);
674
675	e->inst[0] |= 0xe0000000;
676
677	check_swap_src_0_1(pc, &src0, &src1);
678	set_dst(pc, dst, e);
679	set_src_0(pc, src0, e);
680	set_src_1(pc, src1, e);
681	set_src_2(pc, src2, e);
682
683	emit(pc, e);
684}
685
686static void
687emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
688	 struct nv50_reg *src1, struct nv50_reg *src2)
689{
690	struct nv50_program_exec *e = exec(pc);
691
692	e->inst[0] |= 0xe0000000;
693	set_long(pc, e);
694	e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
695
696	check_swap_src_0_1(pc, &src0, &src1);
697	set_dst(pc, dst, e);
698	set_src_0(pc, src0, e);
699	set_src_1(pc, src1, e);
700	set_src_2(pc, src2, e);
701
702	emit(pc, e);
703}
704
705static void
706emit_flop(struct nv50_pc *pc, unsigned sub,
707	  struct nv50_reg *dst, struct nv50_reg *src)
708{
709	struct nv50_program_exec *e = exec(pc);
710
711	e->inst[0] |= 0x90000000;
712	if (sub) {
713		set_long(pc, e);
714		e->inst[1] |= (sub << 29);
715	}
716
717	set_dst(pc, dst, e);
718	set_src_0(pc, src, e);
719
720	emit(pc, e);
721}
722
723static void
724emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
725{
726	struct nv50_program_exec *e = exec(pc);
727
728	e->inst[0] |= 0xb0000000;
729
730	set_dst(pc, dst, e);
731	set_src_0(pc, src, e);
732	set_long(pc, e);
733	e->inst[1] |= (6 << 29) | 0x00004000;
734
735	emit(pc, e);
736}
737
738static void
739emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
740{
741	struct nv50_program_exec *e = exec(pc);
742
743	e->inst[0] |= 0xb0000000;
744
745	set_dst(pc, dst, e);
746	set_src_0(pc, src, e);
747	set_long(pc, e);
748	e->inst[1] |= (6 << 29);
749
750	emit(pc, e);
751}
752
753static void
754emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
755	 struct nv50_reg *src0, struct nv50_reg *src1)
756{
757	struct nv50_program_exec *e = exec(pc);
758	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
759	struct nv50_reg *rdst;
760
761	assert(c_op <= 7);
762	if (check_swap_src_0_1(pc, &src0, &src1))
763		c_op = inv_cop[c_op];
764
765	rdst = dst;
766	if (dst->type != P_TEMP)
767		dst = alloc_temp(pc, NULL);
768
769	/* set.u32 */
770	set_long(pc, e);
771	e->inst[0] |= 0xb0000000;
772	e->inst[1] |= (3 << 29);
773	e->inst[1] |= (c_op << 14);
774	/*XXX: breaks things, .u32 by default?
775	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
776	 *     doesn't seem to match what the hw actually does.
777	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
778	 */
779	set_dst(pc, dst, e);
780	set_src_0(pc, src0, e);
781	set_src_1(pc, src1, e);
782	emit(pc, e);
783
784	/* cvt.f32.u32 */
785	e = exec(pc);
786	e->inst[0] = 0xa0000001;
787	e->inst[1] = 0x64014780;
788	set_dst(pc, rdst, e);
789	set_src_0(pc, dst, e);
790	emit(pc, e);
791
792	if (dst != rdst)
793		free_temp(pc, dst);
794}
795
796static void
797emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
798{
799	struct nv50_program_exec *e = exec(pc);
800
801	e->inst[0] = 0xa0000000; /* cvt */
802	set_long(pc, e);
803	e->inst[1] |= (6 << 29); /* cvt */
804	e->inst[1] |= 0x08000000; /* integer mode */
805	e->inst[1] |= 0x04000000; /* 32 bit */
806	e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
807	e->inst[1] |= (1 << 14); /* src .f32 */
808	set_dst(pc, dst, e);
809	set_src_0(pc, src, e);
810
811	emit(pc, e);
812}
813
/*
 * Emit "dst = pow(v, e)" as a four-instruction sequence on one private
 * temporary: flop 3 (LG2) of v, multiply by e, pre-EX2 step, flop 6 (EX2)
 * into dst.  The temporary is released afterwards.
 */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *acc;

	acc = alloc_temp(pc, NULL);

	emit_flop(pc, 3, acc, v);	/* acc = lg2(v) */
	emit_mul(pc, acc, acc, e);	/* acc *= e */
	emit_preex2(pc, acc, acc);
	emit_flop(pc, 6, dst, acc);	/* dst = ex2(acc) */

	free_temp(pc, acc);
}
827
828static void
829emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
830{
831	struct nv50_program_exec *e = exec(pc);
832
833	e->inst[0] = 0xa0000000; /* cvt */
834	set_long(pc, e);
835	e->inst[1] |= (6 << 29); /* cvt */
836	e->inst[1] |= 0x04000000; /* 32 bit */
837	e->inst[1] |= (1 << 14); /* src .f32 */
838	e->inst[1] |= ((1 << 6) << 14); /* .abs */
839	set_dst(pc, dst, e);
840	set_src_0(pc, src, e);
841
842	emit(pc, e);
843}
844
845static void
846emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
847	 struct nv50_reg **src)
848{
849	struct nv50_reg *one = alloc_immd(pc, 1.0);
850	struct nv50_reg *zero = alloc_immd(pc, 0.0);
851	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
852	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
853	struct nv50_reg *tmp[4];
854
855	if (mask & (1 << 0))
856		emit_mov(pc, dst[0], one);
857
858	if (mask & (1 << 3))
859		emit_mov(pc, dst[3], one);
860
861	if (mask & (3 << 1)) {
862		if (mask & (1 << 1))
863			tmp[0] = dst[1];
864		else
865			tmp[0] = temp_temp(pc);
866		emit_minmax(pc, 4, tmp[0], src[0], zero);
867	}
868
869	if (mask & (1 << 2)) {
870		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
871
872		tmp[1] = temp_temp(pc);
873		emit_minmax(pc, 4, tmp[1], src[1], zero);
874
875		tmp[3] = temp_temp(pc);
876		emit_minmax(pc, 4, tmp[3], src[3], neg128);
877		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
878
879		emit_pow(pc, dst[2], tmp[1], tmp[3]);
880		emit_mov(pc, dst[2], zero);
881		set_pred(pc, 3, 0, pc->p->exec_tail);
882	}
883
884	FREE(pos128);
885	FREE(neg128);
886	FREE(zero);
887	FREE(one);
888}
889
890static void
891emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
892{
893	struct nv50_program_exec *e = exec(pc);
894
895	set_long(pc, e);
896	e->inst[0] |= 0xa0000000; /* delta */
897	e->inst[1] |= (7 << 29); /* delta */
898	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
899	e->inst[1] |= (1 << 14); /* src .f32 */
900	set_dst(pc, dst, e);
901	set_src_0(pc, src, e);
902
903	emit(pc, e);
904}
905
906static void
907emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
908{
909	struct nv50_program_exec *e;
910	const int r_pred = 1;
911
912	/* Sets predicate reg ? */
913	e = exec(pc);
914	e->inst[0] = 0xa00001fd;
915	e->inst[1] = 0xc4014788;
916	set_src_0(pc, src, e);
917	set_pred_wr(pc, 1, r_pred, e);
918	emit(pc, e);
919
920	/* This is probably KILP */
921	e = exec(pc);
922	e->inst[0] = 0x000001fe;
923	set_long(pc, e);
924	set_pred(pc, 1 /* LT? */, r_pred, e);
925	emit(pc, e);
926}
927
928static struct nv50_reg *
929tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
930{
931	switch (dst->DstRegister.File) {
932	case TGSI_FILE_TEMPORARY:
933		return &pc->temp[dst->DstRegister.Index * 4 + c];
934	case TGSI_FILE_OUTPUT:
935		return &pc->result[dst->DstRegister.Index * 4 + c];
936	case TGSI_FILE_NULL:
937		return NULL;
938	default:
939		break;
940	}
941
942	return NULL;
943}
944
945static struct nv50_reg *
946tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
947{
948	struct nv50_reg *r = NULL;
949	struct nv50_reg *temp;
950	unsigned sgn, c;
951
952	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
953
954	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
955	switch (c) {
956	case TGSI_EXTSWIZZLE_X:
957	case TGSI_EXTSWIZZLE_Y:
958	case TGSI_EXTSWIZZLE_Z:
959	case TGSI_EXTSWIZZLE_W:
960		switch (src->SrcRegister.File) {
961		case TGSI_FILE_INPUT:
962			r = &pc->attr[src->SrcRegister.Index * 4 + c];
963			break;
964		case TGSI_FILE_TEMPORARY:
965			r = &pc->temp[src->SrcRegister.Index * 4 + c];
966			break;
967		case TGSI_FILE_CONSTANT:
968			r = &pc->param[src->SrcRegister.Index * 4 + c];
969			break;
970		case TGSI_FILE_IMMEDIATE:
971			r = &pc->immd[src->SrcRegister.Index * 4 + c];
972			break;
973		case TGSI_FILE_SAMPLER:
974			break;
975		default:
976			assert(0);
977			break;
978		}
979		break;
980	case TGSI_EXTSWIZZLE_ZERO:
981		r = alloc_immd(pc, 0.0);
982		return r;
983	case TGSI_EXTSWIZZLE_ONE:
984		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
985			return alloc_immd(pc, -1.0);
986		return alloc_immd(pc, 1.0);
987	default:
988		assert(0);
989		break;
990	}
991
992	switch (sgn) {
993	case TGSI_UTIL_SIGN_KEEP:
994		break;
995	case TGSI_UTIL_SIGN_CLEAR:
996		temp = temp_temp(pc);
997		emit_abs(pc, temp, r);
998		r = temp;
999		break;
1000	case TGSI_UTIL_SIGN_TOGGLE:
1001		temp = temp_temp(pc);
1002		emit_neg(pc, temp, r);
1003		r = temp;
1004		break;
1005	case TGSI_UTIL_SIGN_SET:
1006		temp = temp_temp(pc);
1007		emit_abs(pc, temp, r);
1008		emit_neg(pc, temp, temp);
1009		r = temp;
1010		break;
1011	default:
1012		assert(0);
1013		break;
1014	}
1015
1016	return r;
1017}
1018
1019static boolean
1020nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
1021{
1022	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
1023	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
1024	unsigned mask, sat, unit;
1025	int i, c;
1026
1027	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1028	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1029
1030	for (c = 0; c < 4; c++) {
1031		if (mask & (1 << c))
1032			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1033		else
1034			dst[c] = NULL;
1035		rdst[c] = NULL;
1036		src[0][c] = NULL;
1037		src[1][c] = NULL;
1038		src[2][c] = NULL;
1039	}
1040
1041	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1042		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1043
1044		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1045			unit = fs->SrcRegister.Index;
1046
1047		for (c = 0; c < 4; c++)
1048			src[i][c] = tgsi_src(pc, c, fs);
1049	}
1050
1051	if (sat) {
1052		for (c = 0; c < 4; c++) {
1053			rdst[c] = dst[c];
1054			dst[c] = temp_temp(pc);
1055		}
1056	}
1057
1058	switch (inst->Instruction.Opcode) {
1059	case TGSI_OPCODE_ABS:
1060		for (c = 0; c < 4; c++) {
1061			if (!(mask & (1 << c)))
1062				continue;
1063			emit_abs(pc, dst[c], src[0][c]);
1064		}
1065		break;
1066	case TGSI_OPCODE_ADD:
1067		for (c = 0; c < 4; c++) {
1068			if (!(mask & (1 << c)))
1069				continue;
1070			emit_add(pc, dst[c], src[0][c], src[1][c]);
1071		}
1072		break;
1073	case TGSI_OPCODE_COS:
1074		temp = temp_temp(pc);
1075		emit_precossin(pc, temp, src[0][0]);
1076		emit_flop(pc, 5, temp, temp);
1077		for (c = 0; c < 4; c++) {
1078			if (!(mask & (1 << c)))
1079				continue;
1080			emit_mov(pc, dst[c], temp);
1081		}
1082		break;
1083	case TGSI_OPCODE_DP3:
1084		temp = temp_temp(pc);
1085		emit_mul(pc, temp, src[0][0], src[1][0]);
1086		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1087		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1088		for (c = 0; c < 4; c++) {
1089			if (!(mask & (1 << c)))
1090				continue;
1091			emit_mov(pc, dst[c], temp);
1092		}
1093		break;
1094	case TGSI_OPCODE_DP4:
1095		temp = temp_temp(pc);
1096		emit_mul(pc, temp, src[0][0], src[1][0]);
1097		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1098		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1099		emit_mad(pc, temp, src[0][3], src[1][3], temp);
1100		for (c = 0; c < 4; c++) {
1101			if (!(mask & (1 << c)))
1102				continue;
1103			emit_mov(pc, dst[c], temp);
1104		}
1105		break;
1106	case TGSI_OPCODE_DPH:
1107		temp = temp_temp(pc);
1108		emit_mul(pc, temp, src[0][0], src[1][0]);
1109		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1110		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1111		emit_add(pc, temp, src[1][3], temp);
1112		for (c = 0; c < 4; c++) {
1113			if (!(mask & (1 << c)))
1114				continue;
1115			emit_mov(pc, dst[c], temp);
1116		}
1117		break;
1118	case TGSI_OPCODE_DST:
1119	{
1120		struct nv50_reg *one = alloc_immd(pc, 1.0);
1121		if (mask & (1 << 0))
1122			emit_mov(pc, dst[0], one);
1123		if (mask & (1 << 1))
1124			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1125		if (mask & (1 << 2))
1126			emit_mov(pc, dst[2], src[0][2]);
1127		if (mask & (1 << 3))
1128			emit_mov(pc, dst[3], src[1][3]);
1129		FREE(one);
1130	}
1131		break;
1132	case TGSI_OPCODE_EX2:
1133		temp = temp_temp(pc);
1134		emit_preex2(pc, temp, src[0][0]);
1135		emit_flop(pc, 6, temp, temp);
1136		for (c = 0; c < 4; c++) {
1137			if (!(mask & (1 << c)))
1138				continue;
1139			emit_mov(pc, dst[c], temp);
1140		}
1141		break;
1142	case TGSI_OPCODE_FLR:
1143		for (c = 0; c < 4; c++) {
1144			if (!(mask & (1 << c)))
1145				continue;
1146			emit_flr(pc, dst[c], src[0][c]);
1147		}
1148		break;
1149	case TGSI_OPCODE_FRC:
1150		temp = temp_temp(pc);
1151		for (c = 0; c < 4; c++) {
1152			if (!(mask & (1 << c)))
1153				continue;
1154			emit_flr(pc, temp, src[0][c]);
1155			emit_sub(pc, dst[c], src[0][c], temp);
1156		}
1157		break;
1158	case TGSI_OPCODE_KIL:
1159		emit_kil(pc, src[0][0]);
1160		emit_kil(pc, src[0][1]);
1161		emit_kil(pc, src[0][2]);
1162		emit_kil(pc, src[0][3]);
1163		break;
1164	case TGSI_OPCODE_LIT:
1165		emit_lit(pc, &dst[0], mask, &src[0][0]);
1166		break;
1167	case TGSI_OPCODE_LG2:
1168		temp = temp_temp(pc);
1169		emit_flop(pc, 3, temp, src[0][0]);
1170		for (c = 0; c < 4; c++) {
1171			if (!(mask & (1 << c)))
1172				continue;
1173			emit_mov(pc, dst[c], temp);
1174		}
1175		break;
1176	case TGSI_OPCODE_LRP:
1177		temp = temp_temp(pc);
1178		for (c = 0; c < 4; c++) {
1179			if (!(mask & (1 << c)))
1180				continue;
1181			emit_sub(pc, temp, src[1][c], src[2][c]);
1182			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1183		}
1184		break;
1185	case TGSI_OPCODE_MAD:
1186		for (c = 0; c < 4; c++) {
1187			if (!(mask & (1 << c)))
1188				continue;
1189			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1190		}
1191		break;
1192	case TGSI_OPCODE_MAX:
1193		for (c = 0; c < 4; c++) {
1194			if (!(mask & (1 << c)))
1195				continue;
1196			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1197		}
1198		break;
1199	case TGSI_OPCODE_MIN:
1200		for (c = 0; c < 4; c++) {
1201			if (!(mask & (1 << c)))
1202				continue;
1203			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1204		}
1205		break;
1206	case TGSI_OPCODE_MOV:
1207		for (c = 0; c < 4; c++) {
1208			if (!(mask & (1 << c)))
1209				continue;
1210			emit_mov(pc, dst[c], src[0][c]);
1211		}
1212		break;
1213	case TGSI_OPCODE_MUL:
1214		for (c = 0; c < 4; c++) {
1215			if (!(mask & (1 << c)))
1216				continue;
1217			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1218		}
1219		break;
1220	case TGSI_OPCODE_POW:
1221		temp = temp_temp(pc);
1222		emit_pow(pc, temp, src[0][0], src[1][0]);
1223		for (c = 0; c < 4; c++) {
1224			if (!(mask & (1 << c)))
1225				continue;
1226			emit_mov(pc, dst[c], temp);
1227		}
1228		break;
1229	case TGSI_OPCODE_RCP:
1230		for (c = 0; c < 4; c++) {
1231			if (!(mask & (1 << c)))
1232				continue;
1233			emit_flop(pc, 0, dst[c], src[0][0]);
1234		}
1235		break;
1236	case TGSI_OPCODE_RSQ:
1237		for (c = 0; c < 4; c++) {
1238			if (!(mask & (1 << c)))
1239				continue;
1240			emit_flop(pc, 2, dst[c], src[0][0]);
1241		}
1242		break;
1243	case TGSI_OPCODE_SCS:
1244		temp = temp_temp(pc);
1245		emit_precossin(pc, temp, src[0][0]);
1246		if (mask & (1 << 0))
1247			emit_flop(pc, 5, dst[0], temp);
1248		if (mask & (1 << 1))
1249			emit_flop(pc, 4, dst[1], temp);
1250		if (mask & (1 << 2))
1251			emit_mov_immdval(pc, dst[2], 0.0);
1252		if (mask & (1 << 3))
1253			emit_mov_immdval(pc, dst[3], 1.0);
1254		break;
1255	case TGSI_OPCODE_SGE:
1256		for (c = 0; c < 4; c++) {
1257			if (!(mask & (1 << c)))
1258				continue;
1259			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1260		}
1261		break;
1262	case TGSI_OPCODE_SIN:
1263		temp = temp_temp(pc);
1264		emit_precossin(pc, temp, src[0][0]);
1265		emit_flop(pc, 4, temp, temp);
1266		for (c = 0; c < 4; c++) {
1267			if (!(mask & (1 << c)))
1268				continue;
1269			emit_mov(pc, dst[c], temp);
1270		}
1271		break;
1272	case TGSI_OPCODE_SLT:
1273		for (c = 0; c < 4; c++) {
1274			if (!(mask & (1 << c)))
1275				continue;
1276			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1277		}
1278		break;
1279	case TGSI_OPCODE_SUB:
1280		for (c = 0; c < 4; c++) {
1281			if (!(mask & (1 << c)))
1282				continue;
1283			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1284		}
1285		break;
1286	case TGSI_OPCODE_TEX:
1287	case TGSI_OPCODE_TXP:
1288	{
1289		struct nv50_reg *t[4];
1290		struct nv50_program_exec *e;
1291
1292		alloc_temp4(pc, t, 0);
1293		emit_mov(pc, t[0], src[0][0]);
1294		emit_mov(pc, t[1], src[0][1]);
1295
1296		e = exec(pc);
1297		e->inst[0] = 0xf6400000;
1298		e->inst[0] |= (unit << 9);
1299		set_long(pc, e);
1300		e->inst[1] |= 0x0000c004;
1301		set_dst(pc, t[0], e);
1302		emit(pc, e);
1303
1304		if (mask & (1 << 0)) emit_mov(pc, dst[0], t[0]);
1305		if (mask & (1 << 1)) emit_mov(pc, dst[1], t[1]);
1306		if (mask & (1 << 2)) emit_mov(pc, dst[2], t[2]);
1307		if (mask & (1 << 3)) emit_mov(pc, dst[3], t[3]);
1308
1309		free_temp4(pc, t);
1310	}
1311		break;
1312	case TGSI_OPCODE_XPD:
1313		temp = temp_temp(pc);
1314		if (mask & (1 << 0)) {
1315			emit_mul(pc, temp, src[0][2], src[1][1]);
1316			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1317		}
1318		if (mask & (1 << 1)) {
1319			emit_mul(pc, temp, src[0][0], src[1][2]);
1320			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1321		}
1322		if (mask & (1 << 2)) {
1323			emit_mul(pc, temp, src[0][1], src[1][0]);
1324			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1325		}
1326		if (mask & (1 << 3))
1327			emit_mov_immdval(pc, dst[3], 1.0);
1328		break;
1329	case TGSI_OPCODE_END:
1330		break;
1331	default:
1332		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1333		return FALSE;
1334	}
1335
1336	if (sat) {
1337		for (c = 0; c < 4; c++) {
1338			struct nv50_program_exec *e;
1339
1340			if (!(mask & (1 << c)))
1341				continue;
1342			e = exec(pc);
1343
1344			e->inst[0] = 0xa0000000; /* cvt */
1345			set_long(pc, e);
1346			e->inst[1] |= (6 << 29); /* cvt */
1347			e->inst[1] |= 0x04000000; /* 32 bit */
1348			e->inst[1] |= (1 << 14); /* src .f32 */
1349			e->inst[1] |= ((1 << 5) << 14); /* .sat */
1350			set_dst(pc, rdst[c], e);
1351			set_src_0(pc, dst[c], e);
1352			emit(pc, e);
1353		}
1354	}
1355
1356	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1357		for (c = 0; c < 4; c++) {
1358			if (!src[i][c])
1359				continue;
1360			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1361				FREE(src[i][c]);
1362		}
1363	}
1364
1365	kill_temp_temp(pc);
1366	return TRUE;
1367}
1368
1369/* Adjust a bitmask that indicates what components of a source are used,
1370 * we use this in tx_prep so we only load interpolants that are needed.
1371 */
1372static void
1373insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
1374{
1375	const struct tgsi_instruction_ext_texture *tex;
1376
1377	switch (insn->Instruction.Opcode) {
1378	case TGSI_OPCODE_DP3:
1379		*mask = 0x7;
1380		break;
1381	case TGSI_OPCODE_DP4:
1382	case TGSI_OPCODE_DPH:
1383		*mask = 0xF;
1384		break;
1385	case TGSI_OPCODE_LIT:
1386		*mask = 0xB;
1387		break;
1388	case TGSI_OPCODE_RCP:
1389	case TGSI_OPCODE_RSQ:
1390		*mask = 0x1;
1391		break;
1392	case TGSI_OPCODE_TEX:
1393	case TGSI_OPCODE_TXP:
1394		assert(insn->Instruction.Extended);
1395		tex = &insn->InstructionExtTexture;
1396
1397		*mask = 0x7;
1398		if (tex->Texture == TGSI_TEXTURE_1D)
1399			*mask = 0x1;
1400		else
1401		if (tex->Texture == TGSI_TEXTURE_2D)
1402			*mask = 0x3;
1403
1404		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1405			*mask |= 0x8;
1406		break;
1407	default:
1408		break;
1409	}
1410}
1411
1412static void
1413prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
1414		  unsigned *r_usage[2])
1415{
1416	const struct tgsi_full_instruction *insn;
1417	const struct tgsi_full_src_register *src;
1418	const struct tgsi_dst_register *dst;
1419
1420	unsigned i, c, k, n, mask, *acc_p;
1421
1422	insn = &tok->FullInstruction;
1423	dst = &insn->FullDstRegisters[0].DstRegister;
1424	mask = dst->WriteMask;
1425
1426	if (!r_usage[0])
1427		r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
1428	if (!r_usage[1])
1429		r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
1430
1431	if (dst->File == TGSI_FILE_TEMPORARY) {
1432		for (c = 0; c < 4; c++) {
1433			if (!(mask & (1 << c)))
1434				continue;
1435			r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
1436		}
1437	}
1438
1439	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1440		src = &insn->FullSrcRegisters[i];
1441
1442		switch (src->SrcRegister.File) {
1443		case TGSI_FILE_TEMPORARY:
1444			acc_p = r_usage[0];
1445			break;
1446		case TGSI_FILE_INPUT:
1447			acc_p = r_usage[1];
1448			break;
1449		default:
1450			continue;
1451		}
1452
1453		insn_adjust_mask(insn, &mask);
1454
1455		for (c = 0; c < 4; c++) {
1456			if (!(mask & (1 << c)))
1457				continue;
1458
1459			k = tgsi_util_get_full_src_register_extswizzle(src, c);
1460			switch (k) {
1461			case TGSI_EXTSWIZZLE_X:
1462			case TGSI_EXTSWIZZLE_Y:
1463			case TGSI_EXTSWIZZLE_Z:
1464			case TGSI_EXTSWIZZLE_W:
1465				n = src->SrcRegister.Index * 4 + k;
1466				acc_p[n] = pc->insn_nr;
1467				break;
1468			default:
1469				break;
1470			}
1471		}
1472	}
1473}
1474
1475static unsigned
1476load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
1477	       int *aid, int *p_oid)
1478{
1479	struct nv50_reg *iv;
1480	int oid, c, n;
1481	unsigned mask = 0;
1482
1483	iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
1484
1485	for (c = 0, n = i * 4; c < 4; c++, n++) {
1486		oid = (*p_oid)++;
1487		pc->attr[n].type = P_TEMP;
1488		pc->attr[n].index = i;
1489
1490		if (pc->attr[n].acc == acc[n])
1491			continue;
1492		mask |= (1 << c);
1493
1494		pc->attr[n].acc = acc[n];
1495		pc->attr[n].rhw = pc->attr[n].hw = -1;
1496		alloc_reg(pc, &pc->attr[n]);
1497
1498		pc->attr[n].rhw = (*aid)++;
1499		emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
1500
1501		pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
1502		(*mid)++;
1503		pc->p->cfg.fp.regs[1] += 0x00010001;
1504	}
1505
1506	return mask;
1507}
1508
1509static boolean
1510nv50_program_tx_prep(struct nv50_pc *pc)
1511{
1512	struct tgsi_parse_context p;
1513	boolean ret = FALSE;
1514	unsigned i, c;
1515	unsigned fcol, bcol, fcrd, depr;
1516
1517	/* count (centroid) perspective interpolations */
1518	unsigned centroid_loads = 0;
1519	unsigned perspect_loads = 0;
1520
1521	/* track register access for temps and attrs */
1522	unsigned *r_usage[2];
1523	r_usage[0] = NULL;
1524	r_usage[1] = NULL;
1525
1526	depr = fcol = bcol = fcrd = 0xffff;
1527
1528	if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1529		pc->p->cfg.fp.regs[0] = 0x01000404;
1530		pc->p->cfg.fp.regs[1] = 0x00000400;
1531	}
1532
1533	tgsi_parse_init(&p, pc->p->pipe.tokens);
1534	while (!tgsi_parse_end_of_tokens(&p)) {
1535		const union tgsi_full_token *tok = &p.FullToken;
1536
1537		tgsi_parse_token(&p);
1538		switch (tok->Token.Type) {
1539		case TGSI_TOKEN_TYPE_IMMEDIATE:
1540		{
1541			const struct tgsi_full_immediate *imm =
1542				&p.FullToken.FullImmediate;
1543
1544			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1545				      imm->u.ImmediateFloat32[1].Float,
1546				      imm->u.ImmediateFloat32[2].Float,
1547				      imm->u.ImmediateFloat32[3].Float);
1548		}
1549			break;
1550		case TGSI_TOKEN_TYPE_DECLARATION:
1551		{
1552			const struct tgsi_full_declaration *d;
1553			unsigned last, first, mode;
1554
1555			d = &p.FullToken.FullDeclaration;
1556			first = d->DeclarationRange.First;
1557			last = d->DeclarationRange.Last;
1558
1559			switch (d->Declaration.File) {
1560			case TGSI_FILE_TEMPORARY:
1561				if (pc->temp_nr < (last + 1))
1562					pc->temp_nr = last + 1;
1563				break;
1564			case TGSI_FILE_OUTPUT:
1565				if (pc->result_nr < (last + 1))
1566					pc->result_nr = last + 1;
1567
1568				if (!d->Declaration.Semantic)
1569					break;
1570
1571				switch (d->Semantic.SemanticName) {
1572				case TGSI_SEMANTIC_POSITION:
1573					depr = first;
1574					pc->p->cfg.fp.regs[2] |= 0x00000100;
1575					pc->p->cfg.fp.regs[3] |= 0x00000011;
1576					break;
1577				default:
1578					break;
1579				}
1580
1581				break;
1582			case TGSI_FILE_INPUT:
1583			{
1584				if (pc->attr_nr < (last + 1))
1585					pc->attr_nr = last + 1;
1586
1587				if (pc->p->type != PIPE_SHADER_FRAGMENT)
1588					break;
1589
1590				switch (d->Declaration.Interpolate) {
1591				case TGSI_INTERPOLATE_CONSTANT:
1592					mode = INTERP_FLAT;
1593					break;
1594				case TGSI_INTERPOLATE_PERSPECTIVE:
1595					mode = INTERP_PERSPECTIVE;
1596					break;
1597				default:
1598					mode = INTERP_LINEAR;
1599					break;
1600				}
1601
1602				if (d->Declaration.Semantic) {
1603					switch (d->Semantic.SemanticName) {
1604					case TGSI_SEMANTIC_POSITION:
1605						fcrd = first;
1606						break;
1607					case TGSI_SEMANTIC_COLOR:
1608						fcol = first;
1609						mode = INTERP_PERSPECTIVE;
1610						break;
1611					case TGSI_SEMANTIC_BCOLOR:
1612						bcol = first;
1613						mode = INTERP_PERSPECTIVE;
1614						break;
1615					}
1616				}
1617
1618				if (d->Declaration.Centroid) {
1619					mode |= INTERP_CENTROID;
1620					if (mode & INTERP_PERSPECTIVE)
1621						centroid_loads++;
1622				} else
1623				if (mode & INTERP_PERSPECTIVE)
1624					perspect_loads++;
1625
1626				assert(last < 32);
1627				for (i = first; i <= last; i++)
1628					pc->interp_mode[i] = mode;
1629			}
1630				break;
1631			case TGSI_FILE_CONSTANT:
1632				if (pc->param_nr < (last + 1))
1633					pc->param_nr = last + 1;
1634				break;
1635			case TGSI_FILE_SAMPLER:
1636				break;
1637			default:
1638				NOUVEAU_ERR("bad decl file %d\n",
1639					    d->Declaration.File);
1640				goto out_err;
1641			}
1642		}
1643			break;
1644		case TGSI_TOKEN_TYPE_INSTRUCTION:
1645			pc->insn_nr++;
1646			prep_inspect_insn(pc, tok, r_usage);
1647			break;
1648		default:
1649			break;
1650		}
1651	}
1652
1653	if (pc->temp_nr) {
1654		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
1655		if (!pc->temp)
1656			goto out_err;
1657
1658		for (i = 0; i < pc->temp_nr; i++) {
1659			for (c = 0; c < 4; c++) {
1660				pc->temp[i*4+c].type = P_TEMP;
1661				pc->temp[i*4+c].hw = -1;
1662				pc->temp[i*4+c].rhw = -1;
1663				pc->temp[i*4+c].index = i;
1664				pc->temp[i*4+c].acc = r_usage[0][i*4+c];
1665			}
1666		}
1667	}
1668
1669	if (pc->attr_nr) {
1670		int oid = 4, mid = 4, aid = 0;
1671		/* oid = VP output id
1672		 * aid = FP attribute/interpolant id
1673		 * mid = VP output mapping field ID
1674		 */
1675
1676		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
1677		if (!pc->attr)
1678			goto out_err;
1679
1680		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1681			/* position should be loaded first */
1682			if (fcrd != 0xffff) {
1683				unsigned mask;
1684				mid = 0;
1685				mask = load_fp_attrib(pc, fcrd, r_usage[1],
1686						      &mid, &aid, &oid);
1687				oid = 0;
1688				pc->p->cfg.fp.regs[1] |= (mask << 24);
1689				pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
1690			}
1691			pc->p->cfg.fp.map[0] += 0x03020100;
1692
1693			/* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
1694
1695			if (perspect_loads) {
1696				pc->iv_p = alloc_temp(pc, NULL);
1697
1698				if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
1699					pc->p->cfg.fp.regs[1] |= 0x08000000;
1700					pc->iv_p->rhw = aid++;
1701					emit_interp(pc, pc->iv_p, NULL,
1702						    INTERP_LINEAR);
1703					emit_flop(pc, 0, pc->iv_p, pc->iv_p);
1704				} else {
1705					pc->iv_p->rhw = aid - 1;
1706					emit_flop(pc, 0, pc->iv_p,
1707						  &pc->attr[fcrd * 4 + 3]);
1708				}
1709			}
1710
1711			if (centroid_loads) {
1712				pc->iv_c = alloc_temp(pc, NULL);
1713				pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
1714				emit_interp(pc, pc->iv_c, NULL,
1715					    INTERP_CENTROID);
1716				emit_flop(pc, 0, pc->iv_c, pc->iv_c);
1717				pc->p->cfg.fp.regs[1] |= 0x08000000;
1718			}
1719
1720			for (c = 0; c < 4; c++) {
1721				/* I don't know what these values do, but
1722				 * let's set them like the blob does:
1723				 */
1724				if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
1725					pc->p->cfg.fp.regs[0] += 0x00010000;
1726				if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
1727					pc->p->cfg.fp.regs[0] += 0x00010000;
1728			}
1729
1730			for (i = 0; i < pc->attr_nr; i++)
1731				load_fp_attrib(pc, i, r_usage[1],
1732					       &mid, &aid, &oid);
1733
1734			if (pc->iv_p)
1735				free_temp(pc, pc->iv_p);
1736			if (pc->iv_c)
1737				free_temp(pc, pc->iv_c);
1738
1739			pc->p->cfg.fp.high_map = (mid / 4);
1740			pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
1741		} else {
1742			/* vertex program */
1743			for (i = 0; i < pc->attr_nr * 4; i++) {
1744				pc->p->cfg.vp.attr[aid / 32] |=
1745					(1 << (aid % 32));
1746				pc->attr[i].type = P_ATTR;
1747				pc->attr[i].hw = aid++;
1748				pc->attr[i].index = i / 4;
1749			}
1750		}
1751	}
1752
1753	if (pc->result_nr) {
1754		int rid = 0;
1755
1756		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
1757		if (!pc->result)
1758			goto out_err;
1759
1760		for (i = 0; i < pc->result_nr; i++) {
1761			for (c = 0; c < 4; c++) {
1762				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1763					pc->result[i*4+c].type = P_TEMP;
1764					pc->result[i*4+c].hw = -1;
1765					pc->result[i*4+c].rhw = (i == depr) ?
1766						-1 : rid++;
1767				} else {
1768					pc->result[i*4+c].type = P_RESULT;
1769					pc->result[i*4+c].hw = rid++;
1770				}
1771				pc->result[i*4+c].index = i;
1772			}
1773
1774			if (pc->p->type == PIPE_SHADER_FRAGMENT &&
1775			    depr != 0xffff) {
1776				pc->result[depr * 4 + 2].rhw =
1777					(pc->result_nr - 1) * 4;
1778			}
1779		}
1780	}
1781
1782	if (pc->param_nr) {
1783		int rid = 0;
1784
1785		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
1786		if (!pc->param)
1787			goto out_err;
1788
1789		for (i = 0; i < pc->param_nr; i++) {
1790			for (c = 0; c < 4; c++) {
1791				pc->param[i*4+c].type = P_CONST;
1792				pc->param[i*4+c].hw = rid++;
1793				pc->param[i*4+c].index = i;
1794			}
1795		}
1796	}
1797
1798	if (pc->immd_nr) {
1799		int rid = pc->param_nr * 4;
1800
1801		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
1802		if (!pc->immd)
1803			goto out_err;
1804
1805		for (i = 0; i < pc->immd_nr; i++) {
1806			for (c = 0; c < 4; c++) {
1807				pc->immd[i*4+c].type = P_IMMD;
1808				pc->immd[i*4+c].hw = rid++;
1809				pc->immd[i*4+c].index = i;
1810			}
1811		}
1812	}
1813
1814	ret = TRUE;
1815out_err:
1816	if (r_usage[0])
1817		FREE(r_usage[0]);
1818	if (r_usage[1])
1819		FREE(r_usage[1]);
1820
1821	tgsi_parse_free(&p);
1822	return ret;
1823}
1824
1825static void
1826free_nv50_pc(struct nv50_pc *pc)
1827{
1828	unsigned i;
1829
1830	if (pc->immd)
1831		FREE(pc->immd);
1832	if (pc->param)
1833		FREE(pc->param);
1834	if (pc->result)
1835		FREE(pc->result);
1836	if (pc->attr)
1837		FREE(pc->attr);
1838	if (pc->temp)
1839		FREE(pc->temp);
1840
1841	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
1842		/* deallocate fragment program attributes */
1843		if (pc->r_temp[i] && pc->r_temp[i]->index == -1)
1844			FREE(pc->r_temp[i]);
1845	}
1846
1847	FREE(pc);
1848}
1849
1850static boolean
1851nv50_program_tx(struct nv50_program *p)
1852{
1853	struct tgsi_parse_context parse;
1854	struct nv50_pc *pc;
1855	unsigned k;
1856	boolean ret;
1857
1858	pc = CALLOC_STRUCT(nv50_pc);
1859	if (!pc)
1860		return FALSE;
1861	pc->p = p;
1862	pc->p->cfg.high_temp = 4;
1863
1864	ret = nv50_program_tx_prep(pc);
1865	if (ret == FALSE)
1866		goto out_cleanup;
1867
1868	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1869	while (!tgsi_parse_end_of_tokens(&parse)) {
1870		const union tgsi_full_token *tok = &parse.FullToken;
1871
1872		tgsi_parse_token(&parse);
1873
1874		switch (tok->Token.Type) {
1875		case TGSI_TOKEN_TYPE_INSTRUCTION:
1876			++pc->insn_cur;
1877			ret = nv50_program_tx_insn(pc, tok);
1878			if (ret == FALSE)
1879				goto out_err;
1880			break;
1881		default:
1882			break;
1883		}
1884	}
1885
1886	if (p->type == PIPE_SHADER_FRAGMENT) {
1887		struct nv50_reg out;
1888
1889		out.type = P_TEMP;
1890		for (k = 0; k < pc->result_nr * 4; k++) {
1891			if (pc->result[k].rhw == -1)
1892				continue;
1893			if (pc->result[k].hw != pc->result[k].rhw) {
1894				out.hw = pc->result[k].rhw;
1895				emit_mov(pc, &out, &pc->result[k]);
1896			}
1897			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
1898				pc->p->cfg.high_result = pc->result[k].rhw + 1;
1899		}
1900	}
1901
1902	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
1903	pc->p->exec_tail->inst[1] |= 0x00000001;
1904
1905	p->param_nr = pc->param_nr * 4;
1906	p->immd_nr = pc->immd_nr * 4;
1907	p->immd = pc->immd_buf;
1908
1909out_err:
1910	tgsi_parse_free(&parse);
1911
1912out_cleanup:
1913	free_nv50_pc(pc);
1914	return ret;
1915}
1916
1917static void
1918nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1919{
1920	if (nv50_program_tx(p) == FALSE)
1921		assert(0);
1922	p->translated = TRUE;
1923}
1924
1925static void
1926nv50_program_upload_data(struct nv50_context *nv50, float *map,
1927			 unsigned start, unsigned count)
1928{
1929	struct nouveau_channel *chan = nv50->screen->nvws->channel;
1930	struct nouveau_grobj *tesla = nv50->screen->tesla;
1931
1932	while (count) {
1933		unsigned nr = count > 2047 ? 2047 : count;
1934
1935		BEGIN_RING(chan, tesla, 0x00000f00, 1);
1936		OUT_RING  (chan, (NV50_CB_PMISC << 0) | (start << 8));
1937		BEGIN_RING(chan, tesla, 0x40000f04, nr);
1938		OUT_RINGp (chan, map, nr);
1939
1940		map += nr;
1941		start += nr;
1942		count -= nr;
1943	}
1944}
1945
1946static void
1947nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1948{
1949	struct nouveau_winsys *nvws = nv50->screen->nvws;
1950	struct pipe_winsys *ws = nv50->pipe.winsys;
1951	unsigned nr = p->param_nr + p->immd_nr;
1952
1953	if (!p->data && nr) {
1954		struct nouveau_resource *heap = nv50->screen->vp_data_heap;
1955
1956		if (nvws->res_alloc(heap, nr, p, &p->data)) {
1957			while (heap->next && heap->size < nr) {
1958				struct nv50_program *evict = heap->next->priv;
1959				nvws->res_free(&evict->data);
1960			}
1961
1962			if (nvws->res_alloc(heap, nr, p, &p->data))
1963				assert(0);
1964		}
1965	}
1966
1967	if (p->param_nr) {
1968		float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
1969					    PIPE_BUFFER_USAGE_CPU_READ);
1970		nv50_program_upload_data(nv50, map, p->data->start,
1971					 p->param_nr);
1972		ws->buffer_unmap(ws, nv50->constbuf[p->type]);
1973	}
1974
1975	if (p->immd_nr) {
1976		nv50_program_upload_data(nv50, p->immd,
1977					 p->data->start + p->param_nr,
1978					 p->immd_nr);
1979	}
1980}
1981
1982static void
1983nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1984{
1985	struct nouveau_channel *chan = nv50->screen->nvws->channel;
1986	struct nouveau_grobj *tesla = nv50->screen->tesla;
1987	struct pipe_screen *screen = nv50->pipe.screen;
1988	struct nv50_program_exec *e;
1989	struct nouveau_stateobj *so;
1990	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
1991	unsigned start, count, *up, *ptr;
1992	boolean upload = FALSE;
1993
1994	if (!p->buffer) {
1995		p->buffer = screen->buffer_create(screen, 0x100, 0, p->exec_size * 4);
1996		upload = TRUE;
1997	}
1998
1999	if (p->data && p->data->start != p->data_start) {
2000		for (e = p->exec_head; e; e = e->next) {
2001			unsigned ei, ci;
2002
2003			if (e->param.index < 0)
2004				continue;
2005			ei = e->param.shift >> 5;
2006			ci = e->param.index + p->data->start;
2007
2008			e->inst[ei] &= ~e->param.mask;
2009			e->inst[ei] |= (ci << e->param.shift);
2010		}
2011
2012		p->data_start = p->data->start;
2013		upload = TRUE;
2014	}
2015
2016	if (!upload)
2017		return;
2018
2019#ifdef NV50_PROGRAM_DUMP
2020	NOUVEAU_ERR("-------\n");
2021	for (e = p->exec_head; e; e = e->next) {
2022		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2023		if (is_long(e))
2024			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2025	}
2026#endif
2027
2028	up = ptr = MALLOC(p->exec_size * 4);
2029	for (e = p->exec_head; e; e = e->next) {
2030		*(ptr++) = e->inst[0];
2031		if (is_long(e))
2032			*(ptr++) = e->inst[1];
2033	}
2034
2035	so = so_new(4,2);
2036	so_method(so, nv50->screen->tesla, 0x1280, 3);
2037	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2038	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2039	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2040
2041	start = 0; count = p->exec_size;
2042	while (count) {
2043		struct nouveau_winsys *nvws = nv50->screen->nvws;
2044		unsigned nr;
2045
2046		so_emit(nvws, so);
2047
2048		nr = MIN2(count, 2047);
2049		nr = MIN2(nvws->channel->pushbuf->remaining, nr);
2050		if (nvws->channel->pushbuf->remaining < (nr + 3)) {
2051			FIRE_RING(chan);
2052			continue;
2053		}
2054
2055		BEGIN_RING(chan, tesla, 0x0f00, 1);
2056		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
2057		BEGIN_RING(chan, tesla, 0x40000f04, nr);
2058		OUT_RINGp (chan, up + start, nr);
2059
2060		start += nr;
2061		count -= nr;
2062	}
2063
2064	FREE(up);
2065	so_ref(NULL, &so);
2066}
2067
2068void
2069nv50_vertprog_validate(struct nv50_context *nv50)
2070{
2071	struct nouveau_grobj *tesla = nv50->screen->tesla;
2072	struct nv50_program *p = nv50->vertprog;
2073	struct nouveau_stateobj *so;
2074
2075	if (!p->translated) {
2076		nv50_program_validate(nv50, p);
2077		if (!p->translated)
2078			assert(0);
2079	}
2080
2081	nv50_program_validate_data(nv50, p);
2082	nv50_program_validate_code(nv50, p);
2083
2084	so = so_new(13, 2);
2085	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2086	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2087		  NOUVEAU_BO_HIGH, 0, 0);
2088	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2089		  NOUVEAU_BO_LOW, 0, 0);
2090	so_method(so, tesla, 0x1650, 2);
2091	so_data  (so, p->cfg.vp.attr[0]);
2092	so_data  (so, p->cfg.vp.attr[1]);
2093	so_method(so, tesla, 0x16b8, 1);
2094	so_data  (so, p->cfg.high_result);
2095	so_method(so, tesla, 0x16ac, 2);
2096	so_data  (so, p->cfg.high_result); //8);
2097	so_data  (so, p->cfg.high_temp);
2098	so_method(so, tesla, 0x140c, 1);
2099	so_data  (so, 0); /* program start offset */
2100	so_ref(so, &nv50->state.vertprog);
2101	so_ref(NULL, &so);
2102}
2103
2104void
2105nv50_fragprog_validate(struct nv50_context *nv50)
2106{
2107	struct nouveau_grobj *tesla = nv50->screen->tesla;
2108	struct nv50_program *p = nv50->fragprog;
2109	struct nouveau_stateobj *so;
2110	unsigned i;
2111
2112	if (!p->translated) {
2113		nv50_program_validate(nv50, p);
2114		if (!p->translated)
2115			assert(0);
2116	}
2117
2118	nv50_program_validate_data(nv50, p);
2119	nv50_program_validate_code(nv50, p);
2120
2121	so = so_new(64, 2);
2122	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2123	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2124		  NOUVEAU_BO_HIGH, 0, 0);
2125	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2126		  NOUVEAU_BO_LOW, 0, 0);
2127	so_method(so, tesla, 0x1904, 4);
2128	so_data  (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
2129	so_data  (so, 0x00000004);
2130	so_data  (so, 0x00000000);
2131	so_data  (so, 0x00000000);
2132	so_method(so, tesla, 0x16bc, p->cfg.fp.high_map);
2133	for (i = 0; i < p->cfg.fp.high_map; i++)
2134		so_data(so, p->cfg.fp.map[i]);
2135	so_method(so, tesla, 0x1988, 2);
2136	so_data  (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
2137	so_data  (so, p->cfg.high_temp);
2138	so_method(so, tesla, 0x1298, 1);
2139	so_data  (so, p->cfg.high_result);
2140	so_method(so, tesla, 0x19a8, 1);
2141	so_data  (so, p->cfg.fp.regs[2]);
2142	so_method(so, tesla, 0x196c, 1);
2143	so_data  (so, p->cfg.fp.regs[3]);
2144	so_method(so, tesla, 0x1414, 1);
2145	so_data  (so, 0); /* program start offset */
2146	so_ref(so, &nv50->state.fragprog);
2147	so_ref(NULL, &so);
2148}
2149
2150void
2151nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2152{
2153	struct pipe_screen *pscreen = nv50->pipe.screen;
2154
2155	while (p->exec_head) {
2156		struct nv50_program_exec *e = p->exec_head;
2157
2158		p->exec_head = e->next;
2159		FREE(e);
2160	}
2161	p->exec_tail = NULL;
2162	p->exec_size = 0;
2163
2164	if (p->buffer)
2165		pipe_buffer_reference(&p->buffer, NULL);
2166
2167	nv50->screen->nvws->res_free(&p->data);
2168
2169	p->translated = 0;
2170}
2171
2172