nv50_program.c revision 2b963f5c723401aa2646bd48eefe065cd335e280
19681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd/*
29681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * Copyright 2008 Ben Skeggs
3bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering *
49681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * Permission is hereby granted, free of charge, to any person obtaining a
59681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * copy of this software and associated documentation files (the "Software"),
69681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * to deal in the Software without restriction, including without limitation
79681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * and/or sell copies of the Software, and to permit persons to whom the
99681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * Software is furnished to do so, subject to the following conditions:
109681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd *
119681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * The above copyright notice and this permission notice shall be included in
129681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * all copies or substantial portions of the Software.
13bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering *
149681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
159681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
169681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
179681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
189681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
199681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
201a0fb70d743f900859d9278c6ae01cfc2a993dadLennart Poettering * SOFTWARE.
211a0fb70d743f900859d9278c6ae01cfc2a993dadLennart Poettering */
221a0fb70d743f900859d9278c6ae01cfc2a993dadLennart Poettering
231a0fb70d743f900859d9278c6ae01cfc2a993dadLennart Poettering#include "pipe/p_context.h"
249681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd#include "pipe/p_defines.h"
25bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering#include "pipe/p_state.h"
269681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd#include "pipe/p_inlines.h"
279681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd
28bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering#include "pipe/p_shader_tokens.h"
299681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd#include "tgsi/tgsi_parse.h"
30bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering#include "tgsi/tgsi_util.h"
31bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering
32bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering#include "nv50_context.h"
33bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering
/* Number of entries in nv50_pc::r_temp, i.e. the upper bound on the
 * hardware temporaries the allocators in this file will hand out.
 */
#define NV50_SU_MAX_TEMP 64
//#define NV50_PROGRAM_DUMP
36bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering
37bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering/* ARL - gallium craps itself on progs/vp/arl.txt
38bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering *
 * MSB - Like MAD, but MUL+SUB
 * 	- Drop it as a separate op; introduce a way to negate args for ops
 * 	  that support it.
42bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering *
43bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * Look into inlining IMMD for ops other than MOV (make it general?)
44bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
46bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering *
47bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * case, if the emit_src() causes the inst to suddenly become long.
49bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering *
50bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * Verify half-insns work where expected - and force disable them where they
51bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * don't work - MUL has it forcibly disabled atm as it fixes POW..
52bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering *
 * WARNING: watch dst==src vectors, can overwrite components that are needed.
 * 	ie. SUB R0, R0.yzxw, R0
55bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering *
56bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * Things to check with renouveau:
57bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 	FP attr/result assignment - how?
58bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 		attrib
59bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 			- 0x16bc maps vp output onto fp hpos
60bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 			- 0x16c0 maps vp output onto fp col0
61bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 		result
62bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 			- colr always 0-3
63bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 			- depr always 4
64bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x16bc->0x16e8 --> some binding between vp/fp regs
65bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x16b8 --> VP output count
66bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering *
67bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 	      "MOV rcol.x, fcol.y" = 0x00000004
69bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x19a8 --> as above but 0x00000100 and 0x00000000
70bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 	- 0x00100000 used when KIL used
71bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x196c --> as above but 0x00000011 and 0x00000000
72bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering *
73bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x1988 --> 0xXXNNNNNN
74bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 	- XX == FP high something
75bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering */
/* One scalar register operand as seen by the code emitters in this file. */
struct nv50_reg {
	/* register file the operand lives in */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;
	int index; /* -1 for regs created by alloc_temp/alloc_temp4/alloc_immd */

	int hw; /* hw slot; -1 on a P_TEMP means "not allocated yet" (alloc_reg),
		 * for P_IMMD it is an index into nv50_pc::immd_buf (set_immd) */
	int neg; /* non-zero: emitters encode this source as negated */

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};
92bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering
/* Working state shared by the allocation and emit helpers in this file
 * while a program is being built.
 */
struct nv50_pc {
	/* program receiving the emitted instructions and cfg updates */
	struct nv50_program *p;

	/* hw resources */
	/* r_temp[i] is the reg currently holding hw temporary i, NULL if free */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];

	/* tgsi resources */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	/* immediate values, four floats per ctor_immd() call */
	float *immd_buf;
	/* number of four-float groups stored in immd_buf */
	int immd_nr;

	/* scratch temporaries handed out by temp_temp() and released in
	 * bulk by kill_temp_temp() */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;

	/* broadcast and destination replacement regs */
	struct nv50_reg *r_brdc;
	struct nv50_reg *r_dst[4];

	unsigned interp_mode[32];
	/* perspective interpolation registers */
	struct nv50_reg *iv_p;
	struct nv50_reg *iv_c;

	/* current instruction and total number of insns */
	unsigned insn_cur;
	unsigned insn_nr;

	/* when FALSE, emit_mul/emit_add force the long encoding and emit_mov
	 * does not inline immediates */
	boolean allow32;
};
130
131static void
132alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
133{
134	int i = 0;
135
136	if (reg->type == P_RESULT) {
137		if (pc->p->cfg.high_result < (reg->hw + 1))
138			pc->p->cfg.high_result = reg->hw + 1;
139	}
140
141	if (reg->type != P_TEMP)
142		return;
143
144	if (reg->hw >= 0) {
145		/*XXX: do this here too to catch FP temp-as-attr usage..
146		 *     not clean, but works */
147		if (pc->p->cfg.high_temp < (reg->hw + 1))
148			pc->p->cfg.high_temp = reg->hw + 1;
149		return;
150	}
151
152	if (reg->rhw != -1) {
153		/* try to allocate temporary with index rhw first */
154		if (!(pc->r_temp[reg->rhw])) {
155			pc->r_temp[reg->rhw] = reg;
156			reg->hw = reg->rhw;
157			if (pc->p->cfg.high_temp < (reg->rhw + 1))
158				pc->p->cfg.high_temp = reg->rhw + 1;
159			return;
160		}
161		/* make sure we don't get things like $r0 needs to go
162		 * in $r1 and $r1 in $r0
163		 */
164		i = pc->result_nr * 4;
165	}
166
167	for (; i < NV50_SU_MAX_TEMP; i++) {
168		if (!(pc->r_temp[i])) {
169			pc->r_temp[i] = reg;
170			reg->hw = i;
171			if (pc->p->cfg.high_temp < (i + 1))
172				pc->p->cfg.high_temp = i + 1;
173			return;
174		}
175	}
176
177	assert(0);
178}
179
180static struct nv50_reg *
181alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
182{
183	struct nv50_reg *r;
184	int i;
185
186	if (dst && dst->type == P_TEMP && dst->hw == -1)
187		return dst;
188
189	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
190		if (!pc->r_temp[i]) {
191			r = CALLOC_STRUCT(nv50_reg);
192			r->type = P_TEMP;
193			r->index = -1;
194			r->hw = i;
195			r->rhw = -1;
196			pc->r_temp[i] = r;
197			return r;
198		}
199	}
200
201	assert(0);
202	return NULL;
203}
204
205/* Assign the hw of the discarded temporary register src
206 * to the tgsi register dst and free src.
207 */
208static void
209assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
210{
211	assert(src->index == -1 && src->hw != -1);
212
213	if (dst->hw != -1)
214		pc->r_temp[dst->hw] = NULL;
215	pc->r_temp[src->hw] = dst;
216	dst->hw = src->hw;
217
218	FREE(src);
219}
220
221/* release the hardware resource held by r */
222static void
223release_hw(struct nv50_pc *pc, struct nv50_reg *r)
224{
225	assert(r->type == P_TEMP);
226	if (r->hw == -1)
227		return;
228
229	assert(pc->r_temp[r->hw] == r);
230	pc->r_temp[r->hw] = NULL;
231
232	r->acc = 0;
233	if (r->index == -1)
234		FREE(r);
235}
236
237static void
238free_temp(struct nv50_pc *pc, struct nv50_reg *r)
239{
240	if (r->index == -1) {
241		unsigned hw = r->hw;
242
243		FREE(pc->r_temp[hw]);
244		pc->r_temp[hw] = NULL;
245	}
246}
247
248static int
249alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
250{
251	int i;
252
253	if ((idx + 4) >= NV50_SU_MAX_TEMP)
254		return 1;
255
256	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
257	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
258		return alloc_temp4(pc, dst, idx + 4);
259
260	for (i = 0; i < 4; i++) {
261		dst[i] = CALLOC_STRUCT(nv50_reg);
262		dst[i]->type = P_TEMP;
263		dst[i]->index = -1;
264		dst[i]->hw = idx + i;
265		pc->r_temp[idx + i] = dst[i];
266	}
267
268	return 0;
269}
270
/* Release the four registers of a group via free_temp(), in order. */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	struct nv50_reg **cur = reg;
	struct nv50_reg **end = reg + 4;

	while (cur != end)
		free_temp(pc, *cur++);
}
279
280static struct nv50_reg *
281temp_temp(struct nv50_pc *pc)
282{
283	if (pc->temp_temp_nr >= 16)
284		assert(0);
285
286	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
287	return pc->temp_temp[pc->temp_temp_nr++];
288}
289
290static void
291kill_temp_temp(struct nv50_pc *pc)
292{
293	int i;
294
295	for (i = 0; i < pc->temp_temp_nr; i++)
296		free_temp(pc, pc->temp_temp[i]);
297	pc->temp_temp_nr = 0;
298}
299
300static int
301ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
302{
303	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
304			       (pc->immd_nr + 1) * 4 * sizeof(float));
305	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
306	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
307	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
308	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
309
310	return pc->immd_nr++;
311}
312
313static struct nv50_reg *
314alloc_immd(struct nv50_pc *pc, float f)
315{
316	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
317	unsigned hw;
318
319	for (hw = 0; hw < pc->immd_nr * 4; hw++)
320		if (pc->immd_buf[hw] == f)
321			break;
322
323	if (hw == pc->immd_nr * 4)
324		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
325
326	r->type = P_IMMD;
327	r->hw = hw;
328	r->index = -1;
329	return r;
330}
331
332static struct nv50_program_exec *
333exec(struct nv50_pc *pc)
334{
335	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
336
337	e->param.index = -1;
338	return e;
339}
340
341static void
342emit(struct nv50_pc *pc, struct nv50_program_exec *e)
343{
344	struct nv50_program *p = pc->p;
345
346	if (p->exec_tail)
347		p->exec_tail->next = e;
348	if (!p->exec_head)
349		p->exec_head = e;
350	p->exec_tail = e;
351	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
352}
353
/* Forward declaration: set_pred() and set_pred_wr() below call set_long()
 * before its definition, and set_long() in turn calls both of them.
 */
static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
355
356static boolean
357is_long(struct nv50_program_exec *e)
358{
359	if (e->inst[0] & 1)
360		return TRUE;
361	return FALSE;
362}
363
364static boolean
365is_immd(struct nv50_program_exec *e)
366{
367	if (is_long(e) && (e->inst[1] & 3) == 3)
368		return TRUE;
369	return FALSE;
370}
371
372static INLINE void
373set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
374	 struct nv50_program_exec *e)
375{
376	set_long(pc, e);
377	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
378	e->inst[1] |= (pred << 7) | (idx << 12);
379}
380
381static INLINE void
382set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
383	    struct nv50_program_exec *e)
384{
385	set_long(pc, e);
386	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
387	e->inst[1] |= (idx << 4) | (on << 6);
388}
389
390static INLINE void
391set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
392{
393	if (is_long(e))
394		return;
395
396	e->inst[0] |= 1;
397	set_pred(pc, 0xf, 0, e);
398	set_pred_wr(pc, 0, 0, e);
399}
400
401static INLINE void
402set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
403{
404	if (dst->type == P_RESULT) {
405		set_long(pc, e);
406		e->inst[1] |= 0x00000008;
407	}
408
409	alloc_reg(pc, dst);
410	e->inst[0] |= (dst->hw << 2);
411}
412
413static INLINE void
414set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
415{
416	float f = pc->immd_buf[imm->hw];
417	unsigned val = fui(imm->neg ? -f : f);
418
419	set_long(pc, e);
420	/*XXX: can't be predicated - bits overlap.. catch cases where both
421	 *     are required and avoid them. */
422	set_pred(pc, 0, 0, e);
423	set_pred_wr(pc, 0, 0, e);
424
425	e->inst[1] |= 0x00000002 | 0x00000001;
426	e->inst[0] |= (val & 0x3f) << 16;
427	e->inst[1] |= (val >> 6) << 2;
428}
429
430
/* Interpolation mode bits for emit_interp(); they are combined with '|'
 * and tested with '&'.  INTERP_LINEAR is the absence of any flag.
 */
#define INTERP_LINEAR		0
#define INTERP_FLAT			1
#define INTERP_PERSPECTIVE	2
#define INTERP_CENTROID		4
435
436/* interpolant index has been stored in dst->rhw */
437static void
438emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
439		unsigned mode)
440{
441	assert(dst->rhw != -1);
442	struct nv50_program_exec *e = exec(pc);
443
444	e->inst[0] |= 0x80000000;
445	set_dst(pc, dst, e);
446	e->inst[0] |= (dst->rhw << 16);
447
448	if (mode & INTERP_FLAT) {
449		e->inst[0] |= (1 << 8);
450	} else {
451		if (mode & INTERP_PERSPECTIVE) {
452			e->inst[0] |= (1 << 25);
453			alloc_reg(pc, iv);
454			e->inst[0] |= (iv->hw << 9);
455		}
456
457		if (mode & INTERP_CENTROID)
458			e->inst[0] |= (1 << 24);
459	}
460
461	emit(pc, e);
462}
463
464static void
465set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
466	 struct nv50_program_exec *e)
467{
468	set_long(pc, e);
469
470	e->param.index = src->hw;
471	e->param.shift = s;
472	e->param.mask = m << (s % 32);
473
474	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
475}
476
477static void
478emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
479{
480	struct nv50_program_exec *e = exec(pc);
481
482	e->inst[0] |= 0x10000000;
483
484	set_dst(pc, dst, e);
485
486	if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
487		set_immd(pc, src, e);
488		/*XXX: 32-bit, but steals part of "half" reg space - need to
489		 *     catch and handle this case if/when we do half-regs
490		 */
491	} else
492	if (src->type == P_IMMD || src->type == P_CONST) {
493		set_long(pc, e);
494		set_data(pc, src, 0x7f, 9, e);
495		e->inst[1] |= 0x20000000; /* src0 const? */
496	} else {
497		if (src->type == P_ATTR) {
498			set_long(pc, e);
499			e->inst[1] |= 0x00200000;
500		}
501
502		alloc_reg(pc, src);
503		e->inst[0] |= (src->hw << 9);
504	}
505
506	if (is_long(e) && !is_immd(e)) {
507		e->inst[1] |= 0x04000000; /* 32-bit */
508		e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
509		if (!(e->inst[1] & 0x20000000))
510			e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
511	} else
512		e->inst[0] |= 0x00008000;
513
514	emit(pc, e);
515}
516
517static INLINE void
518emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
519{
520	struct nv50_reg *imm = alloc_immd(pc, f);
521	emit_mov(pc, dst, imm);
522	FREE(imm);
523}
524
525static boolean
526check_swap_src_0_1(struct nv50_pc *pc,
527		   struct nv50_reg **s0, struct nv50_reg **s1)
528{
529	struct nv50_reg *src0 = *s0, *src1 = *s1;
530
531	if (src0->type == P_CONST) {
532		if (src1->type != P_CONST) {
533			*s0 = src1;
534			*s1 = src0;
535			return TRUE;
536		}
537	} else
538	if (src1->type == P_ATTR) {
539		if (src0->type != P_ATTR) {
540			*s0 = src1;
541			*s1 = src0;
542			return TRUE;
543		}
544	}
545
546	return FALSE;
547}
548
549static void
550set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
551{
552	if (src->type == P_ATTR) {
553		set_long(pc, e);
554		e->inst[1] |= 0x00200000;
555	} else
556	if (src->type == P_CONST || src->type == P_IMMD) {
557		struct nv50_reg *temp = temp_temp(pc);
558
559		emit_mov(pc, temp, src);
560		src = temp;
561	}
562
563	alloc_reg(pc, src);
564	e->inst[0] |= (src->hw << 9);
565}
566
567static void
568set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
569{
570	if (src->type == P_ATTR) {
571		struct nv50_reg *temp = temp_temp(pc);
572
573		emit_mov(pc, temp, src);
574		src = temp;
575	} else
576	if (src->type == P_CONST || src->type == P_IMMD) {
577		assert(!(e->inst[0] & 0x00800000));
578		if (e->inst[0] & 0x01000000) {
579			struct nv50_reg *temp = temp_temp(pc);
580
581			emit_mov(pc, temp, src);
582			src = temp;
583		} else {
584			set_data(pc, src, 0x7f, 16, e);
585			e->inst[0] |= 0x00800000;
586		}
587	}
588
589	alloc_reg(pc, src);
590	e->inst[0] |= (src->hw << 16);
591}
592
593static void
594set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
595{
596	set_long(pc, e);
597
598	if (src->type == P_ATTR) {
599		struct nv50_reg *temp = temp_temp(pc);
600
601		emit_mov(pc, temp, src);
602		src = temp;
603	} else
604	if (src->type == P_CONST || src->type == P_IMMD) {
605		assert(!(e->inst[0] & 0x01000000));
606		if (e->inst[0] & 0x00800000) {
607			struct nv50_reg *temp = temp_temp(pc);
608
609			emit_mov(pc, temp, src);
610			src = temp;
611		} else {
612			set_data(pc, src, 0x7f, 32+14, e);
613			e->inst[0] |= 0x01000000;
614		}
615	}
616
617	alloc_reg(pc, src);
618	e->inst[1] |= (src->hw << 14);
619}
620
621static void
622emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
623	 struct nv50_reg *src1)
624{
625	struct nv50_program_exec *e = exec(pc);
626
627	e->inst[0] |= 0xc0000000;
628
629	if (!pc->allow32)
630		set_long(pc, e);
631
632	check_swap_src_0_1(pc, &src0, &src1);
633	set_dst(pc, dst, e);
634	set_src_0(pc, src0, e);
635	if (src1->type == P_IMMD && !is_long(e)) {
636		if (src0->neg)
637			e->inst[0] |= 0x00008000;
638		set_immd(pc, src1, e);
639	} else {
640		set_src_1(pc, src1, e);
641		if (src0->neg ^ src1->neg) {
642			if (is_long(e))
643				e->inst[1] |= 0x08000000;
644			else
645				e->inst[0] |= 0x00008000;
646		}
647	}
648
649	emit(pc, e);
650}
651
652static void
653emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
654	 struct nv50_reg *src0, struct nv50_reg *src1)
655{
656	struct nv50_program_exec *e = exec(pc);
657
658	e->inst[0] |= 0xb0000000;
659
660	check_swap_src_0_1(pc, &src0, &src1);
661
662	if (!pc->allow32 || src0->neg || src1->neg) {
663		set_long(pc, e);
664		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
665	}
666
667	set_dst(pc, dst, e);
668	set_src_0(pc, src0, e);
669	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
670		set_src_2(pc, src1, e);
671	else
672	if (src1->type == P_IMMD)
673		set_immd(pc, src1, e);
674	else
675		set_src_1(pc, src1, e);
676
677	emit(pc, e);
678}
679
680static void
681emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
682	    struct nv50_reg *src0, struct nv50_reg *src1)
683{
684	struct nv50_program_exec *e = exec(pc);
685
686	set_long(pc, e);
687	e->inst[0] |= 0xb0000000;
688	e->inst[1] |= (sub << 29);
689
690	check_swap_src_0_1(pc, &src0, &src1);
691	set_dst(pc, dst, e);
692	set_src_0(pc, src0, e);
693	set_src_1(pc, src1, e);
694
695	emit(pc, e);
696}
697
698static INLINE void
699emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
700	 struct nv50_reg *src1)
701{
702	src1->neg ^= 1;
703	emit_add(pc, dst, src0, src1);
704	src1->neg ^= 1;
705}
706
707static void
708emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
709	 struct nv50_reg *src1, struct nv50_reg *src2)
710{
711	struct nv50_program_exec *e = exec(pc);
712
713	e->inst[0] |= 0xe0000000;
714
715	check_swap_src_0_1(pc, &src0, &src1);
716	set_dst(pc, dst, e);
717	set_src_0(pc, src0, e);
718	set_src_1(pc, src1, e);
719	set_src_2(pc, src2, e);
720
721	if (src0->neg ^ src1->neg)
722		e->inst[1] |= 0x04000000;
723	if (src2->neg)
724		e->inst[1] |= 0x08000000;
725
726	emit(pc, e);
727}
728
729static INLINE void
730emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
731	 struct nv50_reg *src1, struct nv50_reg *src2)
732{
733	src2->neg ^= 1;
734	emit_mad(pc, dst, src0, src1, src2);
735	src2->neg ^= 1;
736}
737
738static void
739emit_flop(struct nv50_pc *pc, unsigned sub,
740	  struct nv50_reg *dst, struct nv50_reg *src)
741{
742	struct nv50_program_exec *e = exec(pc);
743
744	e->inst[0] |= 0x90000000;
745	if (sub) {
746		set_long(pc, e);
747		e->inst[1] |= (sub << 29);
748	}
749
750	set_dst(pc, dst, e);
751	set_src_0(pc, src, e);
752
753	emit(pc, e);
754}
755
756static void
757emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
758{
759	struct nv50_program_exec *e = exec(pc);
760
761	e->inst[0] |= 0xb0000000;
762
763	set_dst(pc, dst, e);
764	set_src_0(pc, src, e);
765	set_long(pc, e);
766	e->inst[1] |= (6 << 29) | 0x00004000;
767
768	emit(pc, e);
769}
770
771static void
772emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
773{
774	struct nv50_program_exec *e = exec(pc);
775
776	e->inst[0] |= 0xb0000000;
777
778	set_dst(pc, dst, e);
779	set_src_0(pc, src, e);
780	set_long(pc, e);
781	e->inst[1] |= (6 << 29);
782
783	emit(pc, e);
784}
785
/* Conversion operation bits: passed as "cvn" to emit_cvt(), which places
 * them at bit 16 of inst[1].  Callers OR them together
 * (e.g. CVTOP_ABS | CVTOP_RN).
 */
#define CVTOP_RN	0x01
#define CVTOP_FLOOR	0x03
#define CVTOP_CEIL	0x05
#define CVTOP_TRUNC	0x07
#define CVTOP_SAT	0x08
#define CVTOP_ABS	0x10

/* Conversion formats: passed as "fmt" to emit_cvt(), which places them
 * at bit 24 of inst[1].  Known bits:
 */
/* 0x04 == 32 bit */
/* 0x40 == dst is float */
/* 0x80 == src is float */
#define CVT_F32_F32 0xc4
#define CVT_F32_S32 0x44
#define CVT_F32_U32 0x64
#define CVT_S32_F32 0x8c
#define CVT_S32_S32 0x0c
#define CVT_F32_F32_ROP 0xcc
802
803static void
804emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
805	 int wp, unsigned cvn, unsigned fmt)
806{
807	struct nv50_program_exec *e;
808
809	e = exec(pc);
810	set_long(pc, e);
811
812	e->inst[0] |= 0xa0000000;
813	e->inst[1] |= 0x00004000;
814	e->inst[1] |= (cvn << 16);
815	e->inst[1] |= (fmt << 24);
816	set_src_0(pc, src, e);
817
818	if (wp >= 0)
819		set_pred_wr(pc, 1, wp, e);
820
821	if (dst)
822		set_dst(pc, dst, e);
823	else {
824		e->inst[0] |= 0x000001fc;
825		e->inst[1] |= 0x00000008;
826	}
827
828	emit(pc, e);
829}
830
831/* nv50 Condition codes:
832 *  0x1 = LT
833 *  0x2 = EQ
834 *  0x3 = LE
835 *  0x4 = GT
836 *  0x5 = NE
837 *  0x6 = GE
838 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
839 *  0x8 = unordered bit (allows NaN)
840 */
841static void
842emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
843	 struct nv50_reg *src0, struct nv50_reg *src1)
844{
845	struct nv50_program_exec *e = exec(pc);
846	struct nv50_reg *rdst;
847
848	assert(ccode < 16);
849	if (check_swap_src_0_1(pc, &src0, &src1))
850		ccode = ccode ^ 0x7;
851
852	rdst = dst;
853	if (dst && dst->type != P_TEMP)
854		dst = alloc_temp(pc, NULL);
855
856	/* set.u32 */
857	set_long(pc, e);
858	e->inst[0] |= 0xb0000000;
859	e->inst[1] |= 0x60000000 | (ccode << 14);
860
861	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
862	 * that doesn't seem to match what the hw actually does
863	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
864	 */
865
866	if (wp >= 0)
867		set_pred_wr(pc, 1, wp, e);
868	if (dst)
869		set_dst(pc, dst, e);
870	else {
871		e->inst[0] |= 0x000001fc;
872		e->inst[1] |= 0x00000008;
873	}
874
875	set_src_0(pc, src0, e);
876	set_src_1(pc, src1, e);
877
878	emit(pc, e);
879
880	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
881	if (rdst)
882		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
883	if (rdst && rdst != dst)
884		free_temp(pc, dst);
885}
886
887static INLINE unsigned
888map_tgsi_setop_cc(unsigned op)
889{
890	switch (op) {
891	case TGSI_OPCODE_SLT: return 0x1;
892	case TGSI_OPCODE_SGE: return 0x6;
893	case TGSI_OPCODE_SEQ: return 0x2;
894	case TGSI_OPCODE_SGT: return 0x4;
895	case TGSI_OPCODE_SLE: return 0x3;
896	case TGSI_OPCODE_SNE: return 0xd;
897	default:
898		assert(0);
899		return 0;
900	}
901}
902
903static INLINE void
904emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
905{
906	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
907}
908
909static void
910emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
911	 struct nv50_reg *v, struct nv50_reg *e)
912{
913	struct nv50_reg *temp = alloc_temp(pc, NULL);
914
915	emit_flop(pc, 3, temp, v);
916	emit_mul(pc, temp, temp, e);
917	emit_preex2(pc, temp, temp);
918	emit_flop(pc, 6, dst, temp);
919
920	free_temp(pc, temp);
921}
922
923static INLINE void
924emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
925{
926	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
927}
928
929static INLINE void
930emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
931{
932	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
933}
934
/* Emit the instruction sequence for a LIT-style operation.
 *
 * dst:  four destination component registers
 * mask: write mask; bit c set means dst[c] is written
 * src:  four source component registers (src[0], src[1], src[3] are read)
 *
 * dst[0] and dst[3] receive 1.0.  dst[1] takes over the temporary holding
 * minmax(4)(src[0], 0).  dst[2] receives pow(minmax(4)(src[1], 0),
 * clamp(src[3], -127.999999, 127.999999)) and is then conditionally
 * overwritten with 0 by a predicated MOV.  Short 32-bit encodings are
 * disabled while the sequence is built.
 */
static void
emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0);
	struct nv50_reg *zero = alloc_immd(pc, 0.0);
	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
	struct nv50_reg *tmp[4];
	boolean allow32 = pc->allow32;

	/* force long encodings; restored further down */
	pc->allow32 = FALSE;

	/* needed by both the y and the z component */
	if (mask & (3 << 1)) {
		tmp[0] = alloc_temp(pc, NULL);
		emit_minmax(pc, 4, tmp[0], src[0], zero);
	}

	if (mask & (1 << 2)) {
		/* exec_tail is the minmax emitted just above: make it also
		 * write predicate register 0 */
		set_pred_wr(pc, 1, 0, pc->p->exec_tail);

		tmp[1] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[1], src[1], zero);

		/* clamp the exponent to [-127.999999, 127.999999] */
		tmp[3] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[3], src[3], neg128);
		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);

		emit_pow(pc, dst[2], tmp[1], tmp[3]);
		emit_mov(pc, dst[2], zero);
		/* predicate the MOV of zero on predicate reg 0 with condition
		 * 3 (LE per the condition code table in this file) */
		set_pred(pc, 3, 0, pc->p->exec_tail);
	}

	/* tmp[0] either becomes dst[1] or is no longer needed */
	if (mask & (1 << 1))
		assimilate_temp(pc, dst[1], tmp[0]);
	else
	if (mask & (1 << 2))
		free_temp(pc, tmp[0]);

	pc->allow32 = allow32;

	/* do this last, in case src[i,j] == dst[0,3] */
	if (mask & (1 << 0))
		emit_mov(pc, dst[0], one);

	if (mask & (1 << 3))
		emit_mov(pc, dst[3], one);

	FREE(pos128);
	FREE(neg128);
	FREE(zero);
	FREE(one);
}
988
989static void
990emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
991{
992	struct nv50_program_exec *e = exec(pc);
993
994	set_long(pc, e);
995	e->inst[0] |= 0xa0000000; /* delta */
996	e->inst[1] |= (7 << 29); /* delta */
997	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
998	e->inst[1] |= (1 << 14); /* src .f32 */
999	set_dst(pc, dst, e);
1000	set_src_0(pc, src, e);
1001
1002	emit(pc, e);
1003}
1004
1005static void
1006emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
1007{
1008	struct nv50_program_exec *e;
1009	const int r_pred = 1;
1010
1011	/* Sets predicate reg ? */
1012	e = exec(pc);
1013	e->inst[0] = 0xa00001fd;
1014	e->inst[1] = 0xc4014788;
1015	set_src_0(pc, src, e);
1016	set_pred_wr(pc, 1, r_pred, e);
1017	if (src->neg)
1018		e->inst[1] |= 0x20000000;
1019	emit(pc, e);
1020
1021	/* This is probably KILP */
1022	e = exec(pc);
1023	e->inst[0] = 0x000001fe;
1024	set_long(pc, e);
1025	set_pred(pc, 1 /* LT? */, r_pred, e);
1026	emit(pc, e);
1027}
1028
1029static void
1030emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
1031	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
1032{
1033	struct nv50_reg *temp, *t[4];
1034	struct nv50_program_exec *e;
1035
1036	unsigned c, mode, dim;
1037
1038	switch (type) {
1039	case TGSI_TEXTURE_1D:
1040		dim = 1;
1041		break;
1042	case TGSI_TEXTURE_UNKNOWN:
1043	case TGSI_TEXTURE_2D:
1044	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
1045	case TGSI_TEXTURE_RECT:
1046		dim = 2;
1047		break;
1048	case TGSI_TEXTURE_3D:
1049	case TGSI_TEXTURE_CUBE:
1050	case TGSI_TEXTURE_SHADOW2D:
1051	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
1052		dim = 3;
1053		break;
1054	default:
1055		assert(0);
1056		break;
1057	}
1058
1059	/* some cards need t[0]'s hw index to be a multiple of 4 */
1060	alloc_temp4(pc, t, 0);
1061
1062	if (proj) {
1063		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
1064			mode = pc->interp_mode[src[0]->index];
1065
1066			t[3]->rhw = src[3]->rhw;
1067			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
1068			emit_flop(pc, 0, t[3], t[3]);
1069
1070			for (c = 0; c < dim; c++) {
1071				t[c]->rhw = src[c]->rhw;
1072				emit_interp(pc, t[c], t[3],
1073					    (mode | INTERP_PERSPECTIVE));
1074			}
1075		} else {
1076			emit_flop(pc, 0, t[3], src[3]);
1077			for (c = 0; c < dim; c++)
1078				emit_mul(pc, t[c], src[c], t[3]);
1079
1080			/* XXX: for some reason the blob sometimes uses MAD:
1081			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
1082			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
1083			 */
1084		}
1085	} else {
1086		if (type == TGSI_TEXTURE_CUBE) {
1087			temp = temp_temp(pc);
1088			emit_minmax(pc, 4, temp, src[0], src[1]);
1089			emit_minmax(pc, 4, temp, temp, src[2]);
1090			emit_flop(pc, 0, temp, temp);
1091			for (c = 0; c < 3; c++)
1092				emit_mul(pc, t[c], src[c], temp);
1093		} else {
1094			for (c = 0; c < dim; c++)
1095				emit_mov(pc, t[c], src[c]);
1096		}
1097	}
1098
1099	e = exec(pc);
1100	set_long(pc, e);
1101	e->inst[0] |= 0xf0000000;
1102	e->inst[1] |= 0x00000004;
1103	set_dst(pc, t[0], e);
1104	e->inst[0] |= (unit << 9);
1105
1106	if (dim == 2)
1107		e->inst[0] |= 0x00400000;
1108	else
1109	if (dim == 3)
1110		e->inst[0] |= 0x00800000;
1111
1112	e->inst[0] |= (mask & 0x3) << 25;
1113	e->inst[1] |= (mask & 0xc) << 12;
1114
1115	emit(pc, e);
1116
1117#if 1
1118	if (mask & 1) emit_mov(pc, dst[0], t[0]);
1119	if (mask & 2) emit_mov(pc, dst[1], t[1]);
1120	if (mask & 4) emit_mov(pc, dst[2], t[2]);
1121	if (mask & 8) emit_mov(pc, dst[3], t[3]);
1122
1123	free_temp4(pc, t);
1124#else
1125	/* XXX: if p.e. MUL is used directly after TEX, it would still use
1126	 * the texture coordinates, not the fetched values: latency ? */
1127
1128	for (c = 0; c < 4; c++) {
1129		if (mask & (1 << c))
1130			assimilate_temp(pc, dst[c], t[c]);
1131		else
1132			free_temp(pc, t[c]);
1133	}
1134#endif
1135}
1136
1137static void
1138convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
1139{
1140	unsigned q = 0, m = ~0;
1141
1142	assert(!is_long(e));
1143
1144	switch (e->inst[0] >> 28) {
1145	case 0x1:
1146		/* MOV */
1147		q = 0x0403c000;
1148		m = 0xffff7fff;
1149		break;
1150	case 0x8:
1151		/* INTERP (move centroid, perspective and flat bits) */
1152		m = ~0x03000100;
1153		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
1154		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
1155		break;
1156	case 0x9:
1157		/* RCP */
1158		break;
1159	case 0xB:
1160		/* ADD */
1161		m = ~(127 << 16);
1162		q = ((e->inst[0] & (~m)) >> 2);
1163		break;
1164	case 0xC:
1165		/* MUL */
1166		m = ~0x00008000;
1167		q = ((e->inst[0] & (~m)) << 12);
1168		break;
1169	case 0xE:
1170		/* MAD (if src2 == dst) */
1171		q = ((e->inst[0] & 0x1fc) << 12);
1172		break;
1173	default:
1174		assert(0);
1175		break;
1176	}
1177
1178	set_long(pc, e);
1179	pc->p->exec_size++;
1180
1181	e->inst[0] &= m;
1182	e->inst[1] |= q;
1183}
1184
1185static boolean
1186negate_supported(const struct tgsi_full_instruction *insn, int i)
1187{
1188	switch (insn->Instruction.Opcode) {
1189	case TGSI_OPCODE_DP3:
1190	case TGSI_OPCODE_DP4:
1191	case TGSI_OPCODE_MUL:
1192	case TGSI_OPCODE_KIL:
1193	case TGSI_OPCODE_ADD:
1194	case TGSI_OPCODE_SUB:
1195	case TGSI_OPCODE_MAD:
1196		return TRUE;
1197	case TGSI_OPCODE_POW:
1198		return (i == 1) ? TRUE : FALSE;
1199	default:
1200		return FALSE;
1201	}
1202}
1203
1204/* Return a read mask for source registers deduced from opcode & write mask. */
1205static unsigned
1206nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
1207{
1208	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
1209
1210	switch (insn->Instruction.Opcode) {
1211	case TGSI_OPCODE_COS:
1212	case TGSI_OPCODE_SIN:
1213		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
1214	case TGSI_OPCODE_DP3:
1215		return 0x7;
1216	case TGSI_OPCODE_DP4:
1217	case TGSI_OPCODE_DPH:
1218	case TGSI_OPCODE_KIL: /* WriteMask ignored */
1219		return 0xf;
1220	case TGSI_OPCODE_DST:
1221		return mask & (c ? 0xa : 0x6);
1222	case TGSI_OPCODE_EX2:
1223	case TGSI_OPCODE_LG2:
1224	case TGSI_OPCODE_POW:
1225	case TGSI_OPCODE_RCP:
1226	case TGSI_OPCODE_RSQ:
1227	case TGSI_OPCODE_SCS:
1228		return 0x1;
1229	case TGSI_OPCODE_LIT:
1230		return 0xb;
1231	case TGSI_OPCODE_TEX:
1232	case TGSI_OPCODE_TXP:
1233	{
1234		const struct tgsi_instruction_ext_texture *tex;
1235
1236		assert(insn->Instruction.Extended);
1237		tex = &insn->InstructionExtTexture;
1238
1239		mask = 0x7;
1240		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
1241			mask |= 0x8;
1242
1243		switch (tex->Texture) {
1244		case TGSI_TEXTURE_1D:
1245			mask &= 0x9;
1246			break;
1247		case TGSI_TEXTURE_2D:
1248			mask &= 0xb;
1249			break;
1250		default:
1251			break;
1252		}
1253	}
1254		return mask;
1255	case TGSI_OPCODE_XPD:
1256		x = 0;
1257		if (mask & 1) x |= 0x6;
1258		if (mask & 2) x |= 0x5;
1259		if (mask & 4) x |= 0x3;
1260		return x;
1261	default:
1262		break;
1263	}
1264
1265	return mask;
1266}
1267
1268static struct nv50_reg *
1269tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
1270{
1271	switch (dst->DstRegister.File) {
1272	case TGSI_FILE_TEMPORARY:
1273		return &pc->temp[dst->DstRegister.Index * 4 + c];
1274	case TGSI_FILE_OUTPUT:
1275		return &pc->result[dst->DstRegister.Index * 4 + c];
1276	case TGSI_FILE_NULL:
1277		return NULL;
1278	default:
1279		break;
1280	}
1281
1282	return NULL;
1283}
1284
1285static struct nv50_reg *
1286tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
1287	 boolean neg)
1288{
1289	struct nv50_reg *r = NULL;
1290	struct nv50_reg *temp;
1291	unsigned sgn, c;
1292
1293	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
1294
1295	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
1296	switch (c) {
1297	case TGSI_EXTSWIZZLE_X:
1298	case TGSI_EXTSWIZZLE_Y:
1299	case TGSI_EXTSWIZZLE_Z:
1300	case TGSI_EXTSWIZZLE_W:
1301		switch (src->SrcRegister.File) {
1302		case TGSI_FILE_INPUT:
1303			r = &pc->attr[src->SrcRegister.Index * 4 + c];
1304			break;
1305		case TGSI_FILE_TEMPORARY:
1306			r = &pc->temp[src->SrcRegister.Index * 4 + c];
1307			break;
1308		case TGSI_FILE_CONSTANT:
1309			r = &pc->param[src->SrcRegister.Index * 4 + c];
1310			break;
1311		case TGSI_FILE_IMMEDIATE:
1312			r = &pc->immd[src->SrcRegister.Index * 4 + c];
1313			break;
1314		case TGSI_FILE_SAMPLER:
1315			break;
1316		default:
1317			assert(0);
1318			break;
1319		}
1320		break;
1321	case TGSI_EXTSWIZZLE_ZERO:
1322		r = alloc_immd(pc, 0.0);
1323		return r;
1324	case TGSI_EXTSWIZZLE_ONE:
1325		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
1326			return alloc_immd(pc, -1.0);
1327		return alloc_immd(pc, 1.0);
1328	default:
1329		assert(0);
1330		break;
1331	}
1332
1333	switch (sgn) {
1334	case TGSI_UTIL_SIGN_KEEP:
1335		break;
1336	case TGSI_UTIL_SIGN_CLEAR:
1337		temp = temp_temp(pc);
1338		emit_abs(pc, temp, r);
1339		r = temp;
1340		break;
1341	case TGSI_UTIL_SIGN_TOGGLE:
1342		if (neg)
1343			r->neg = 1;
1344		else {
1345			temp = temp_temp(pc);
1346			emit_neg(pc, temp, r);
1347			r = temp;
1348		}
1349		break;
1350	case TGSI_UTIL_SIGN_SET:
1351		temp = temp_temp(pc);
1352		emit_abs(pc, temp, r);
1353		if (neg)
1354			temp->neg = 1;
1355		else
1356			emit_neg(pc, temp, temp);
1357		r = temp;
1358		break;
1359	default:
1360		assert(0);
1361		break;
1362	}
1363
1364	return r;
1365}
1366
1367/* return TRUE for ops that produce only a single result */
1368static boolean
1369is_scalar_op(unsigned op)
1370{
1371	switch (op) {
1372	case TGSI_OPCODE_COS:
1373	case TGSI_OPCODE_DP2:
1374	case TGSI_OPCODE_DP3:
1375	case TGSI_OPCODE_DP4:
1376	case TGSI_OPCODE_DPH:
1377	case TGSI_OPCODE_EX2:
1378	case TGSI_OPCODE_LG2:
1379	case TGSI_OPCODE_POW:
1380	case TGSI_OPCODE_RCP:
1381	case TGSI_OPCODE_RSQ:
1382	case TGSI_OPCODE_SIN:
1383		/*
1384	case TGSI_OPCODE_KIL:
1385	case TGSI_OPCODE_LIT:
1386	case TGSI_OPCODE_SCS:
1387		*/
1388		return TRUE;
1389	default:
1390		return FALSE;
1391	}
1392}
1393
1394/* Returns a bitmask indicating which dst components depend
1395 * on source s, component c (reverse of nv50_tgsi_src_mask).
1396 */
1397static unsigned
1398nv50_tgsi_dst_revdep(unsigned op, int s, int c)
1399{
1400	if (is_scalar_op(op))
1401		return 0x1;
1402
1403	switch (op) {
1404	case TGSI_OPCODE_DST:
1405		return (1 << c) & (s ? 0xa : 0x6);
1406	case TGSI_OPCODE_XPD:
1407		switch (c) {
1408		case 0: return 0x6;
1409		case 1: return 0x5;
1410		case 2: return 0x3;
1411		case 3: return 0x0;
1412		default:
1413			assert(0);
1414			return 0x0;
1415		}
1416	case TGSI_OPCODE_LIT:
1417	case TGSI_OPCODE_SCS:
1418	case TGSI_OPCODE_TEX:
1419	case TGSI_OPCODE_TXP:
1420		/* these take care of dangerous swizzles themselves */
1421		return 0x0;
1422	case TGSI_OPCODE_IF:
1423	case TGSI_OPCODE_KIL:
1424		/* don't call this function for these ops */
1425		assert(0);
1426		return 0;
1427	default:
1428		/* linear vector instruction */
1429		return (1 << c);
1430	}
1431}
1432
1433static boolean
1434nv50_program_tx_insn(struct nv50_pc *pc,
1435		     const struct tgsi_full_instruction *inst)
1436{
1437	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
1438	unsigned mask, sat, unit;
1439	int i, c;
1440
1441	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
1442	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
1443
1444	memset(src, 0, sizeof(src));
1445
1446	for (c = 0; c < 4; c++) {
1447		if ((mask & (1 << c)) && !pc->r_dst[c])
1448			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
1449		else
1450			dst[c] = pc->r_dst[c];
1451		rdst[c] = dst[c];
1452	}
1453
1454	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1455		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
1456		unsigned src_mask;
1457		boolean neg_supp;
1458
1459		src_mask = nv50_tgsi_src_mask(inst, i);
1460		neg_supp = negate_supported(inst, i);
1461
1462		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
1463			unit = fs->SrcRegister.Index;
1464
1465		for (c = 0; c < 4; c++)
1466			if (src_mask & (1 << c))
1467				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
1468	}
1469
1470	brdc = temp = pc->r_brdc;
1471	if (brdc && brdc->type != P_TEMP) {
1472		temp = temp_temp(pc);
1473		if (sat)
1474			brdc = temp;
1475	} else
1476	if (sat) {
1477		for (c = 0; c < 4; c++) {
1478			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
1479				continue;
1480			rdst[c] = dst[c];
1481			dst[c] = temp_temp(pc);
1482		}
1483	}
1484
1485	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
1486
1487	switch (inst->Instruction.Opcode) {
1488	case TGSI_OPCODE_ABS:
1489		for (c = 0; c < 4; c++) {
1490			if (!(mask & (1 << c)))
1491				continue;
1492			emit_abs(pc, dst[c], src[0][c]);
1493		}
1494		break;
1495	case TGSI_OPCODE_ADD:
1496		for (c = 0; c < 4; c++) {
1497			if (!(mask & (1 << c)))
1498				continue;
1499			emit_add(pc, dst[c], src[0][c], src[1][c]);
1500		}
1501		break;
1502	case TGSI_OPCODE_COS:
1503		if (mask & 8) {
1504			emit_precossin(pc, temp, src[0][3]);
1505			emit_flop(pc, 5, dst[3], temp);
1506			if (!(mask &= 7))
1507				break;
1508			if (temp == dst[3])
1509				temp = brdc = temp_temp(pc);
1510		}
1511		emit_precossin(pc, temp, src[0][0]);
1512		emit_flop(pc, 5, brdc, temp);
1513		break;
1514	case TGSI_OPCODE_DP3:
1515		emit_mul(pc, temp, src[0][0], src[1][0]);
1516		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1517		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
1518		break;
1519	case TGSI_OPCODE_DP4:
1520		emit_mul(pc, temp, src[0][0], src[1][0]);
1521		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1522		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1523		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
1524		break;
1525	case TGSI_OPCODE_DPH:
1526		emit_mul(pc, temp, src[0][0], src[1][0]);
1527		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1528		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1529		emit_add(pc, brdc, src[1][3], temp);
1530		break;
1531	case TGSI_OPCODE_DST:
1532		if (mask & (1 << 1))
1533			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1534		if (mask & (1 << 2))
1535			emit_mov(pc, dst[2], src[0][2]);
1536		if (mask & (1 << 3))
1537			emit_mov(pc, dst[3], src[1][3]);
1538		if (mask & (1 << 0))
1539			emit_mov_immdval(pc, dst[0], 1.0f);
1540		break;
1541	case TGSI_OPCODE_EX2:
1542		emit_preex2(pc, temp, src[0][0]);
1543		emit_flop(pc, 6, brdc, temp);
1544		break;
1545	case TGSI_OPCODE_FLR:
1546		for (c = 0; c < 4; c++) {
1547			if (!(mask & (1 << c)))
1548				continue;
1549			emit_flr(pc, dst[c], src[0][c]);
1550		}
1551		break;
1552	case TGSI_OPCODE_FRC:
1553		temp = temp_temp(pc);
1554		for (c = 0; c < 4; c++) {
1555			if (!(mask & (1 << c)))
1556				continue;
1557			emit_flr(pc, temp, src[0][c]);
1558			emit_sub(pc, dst[c], src[0][c], temp);
1559		}
1560		break;
1561	case TGSI_OPCODE_KIL:
1562		emit_kil(pc, src[0][0]);
1563		emit_kil(pc, src[0][1]);
1564		emit_kil(pc, src[0][2]);
1565		emit_kil(pc, src[0][3]);
1566		pc->p->cfg.fp.regs[2] |= 0x00100000;
1567		break;
1568	case TGSI_OPCODE_LIT:
1569		emit_lit(pc, &dst[0], mask, &src[0][0]);
1570		break;
1571	case TGSI_OPCODE_LG2:
1572		emit_flop(pc, 3, brdc, src[0][0]);
1573		break;
1574	case TGSI_OPCODE_LRP:
1575		temp = temp_temp(pc);
1576		for (c = 0; c < 4; c++) {
1577			if (!(mask & (1 << c)))
1578				continue;
1579			emit_sub(pc, temp, src[1][c], src[2][c]);
1580			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1581		}
1582		break;
1583	case TGSI_OPCODE_MAD:
1584		for (c = 0; c < 4; c++) {
1585			if (!(mask & (1 << c)))
1586				continue;
1587			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1588		}
1589		break;
1590	case TGSI_OPCODE_MAX:
1591		for (c = 0; c < 4; c++) {
1592			if (!(mask & (1 << c)))
1593				continue;
1594			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1595		}
1596		break;
1597	case TGSI_OPCODE_MIN:
1598		for (c = 0; c < 4; c++) {
1599			if (!(mask & (1 << c)))
1600				continue;
1601			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1602		}
1603		break;
1604	case TGSI_OPCODE_MOV:
1605	case TGSI_OPCODE_SWZ:
1606		for (c = 0; c < 4; c++) {
1607			if (!(mask & (1 << c)))
1608				continue;
1609			emit_mov(pc, dst[c], src[0][c]);
1610		}
1611		break;
1612	case TGSI_OPCODE_MUL:
1613		for (c = 0; c < 4; c++) {
1614			if (!(mask & (1 << c)))
1615				continue;
1616			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1617		}
1618		break;
1619	case TGSI_OPCODE_POW:
1620		emit_pow(pc, brdc, src[0][0], src[1][0]);
1621		break;
1622	case TGSI_OPCODE_RCP:
1623		emit_flop(pc, 0, brdc, src[0][0]);
1624		break;
1625	case TGSI_OPCODE_RSQ:
1626		emit_flop(pc, 2, brdc, src[0][0]);
1627		break;
1628	case TGSI_OPCODE_SCS:
1629		temp = temp_temp(pc);
1630		if (mask & 3)
1631			emit_precossin(pc, temp, src[0][0]);
1632		if (mask & (1 << 0))
1633			emit_flop(pc, 5, dst[0], temp);
1634		if (mask & (1 << 1))
1635			emit_flop(pc, 4, dst[1], temp);
1636		if (mask & (1 << 2))
1637			emit_mov_immdval(pc, dst[2], 0.0);
1638		if (mask & (1 << 3))
1639			emit_mov_immdval(pc, dst[3], 1.0);
1640		break;
1641	case TGSI_OPCODE_SIN:
1642		if (mask & 8) {
1643			emit_precossin(pc, temp, src[0][3]);
1644			emit_flop(pc, 4, dst[3], temp);
1645			if (!(mask &= 7))
1646				break;
1647			if (temp == dst[3])
1648				temp = brdc = temp_temp(pc);
1649		}
1650		emit_precossin(pc, temp, src[0][0]);
1651		emit_flop(pc, 4, brdc, temp);
1652		break;
1653	case TGSI_OPCODE_SLT:
1654	case TGSI_OPCODE_SGE:
1655	case TGSI_OPCODE_SEQ:
1656	case TGSI_OPCODE_SGT:
1657	case TGSI_OPCODE_SLE:
1658	case TGSI_OPCODE_SNE:
1659		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
1660		for (c = 0; c < 4; c++) {
1661			if (!(mask & (1 << c)))
1662				continue;
1663			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
1664		}
1665		break;
1666	case TGSI_OPCODE_SUB:
1667		for (c = 0; c < 4; c++) {
1668			if (!(mask & (1 << c)))
1669				continue;
1670			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1671		}
1672		break;
1673	case TGSI_OPCODE_TEX:
1674		emit_tex(pc, dst, mask, src[0], unit,
1675			 inst->InstructionExtTexture.Texture, FALSE);
1676		break;
1677	case TGSI_OPCODE_TXP:
1678		emit_tex(pc, dst, mask, src[0], unit,
1679			 inst->InstructionExtTexture.Texture, TRUE);
1680		break;
1681	case TGSI_OPCODE_XPD:
1682		temp = temp_temp(pc);
1683		if (mask & (1 << 0)) {
1684			emit_mul(pc, temp, src[0][2], src[1][1]);
1685			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1686		}
1687		if (mask & (1 << 1)) {
1688			emit_mul(pc, temp, src[0][0], src[1][2]);
1689			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1690		}
1691		if (mask & (1 << 2)) {
1692			emit_mul(pc, temp, src[0][1], src[1][0]);
1693			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1694		}
1695		if (mask & (1 << 3))
1696			emit_mov_immdval(pc, dst[3], 1.0);
1697		break;
1698	case TGSI_OPCODE_END:
1699		break;
1700	default:
1701		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1702		return FALSE;
1703	}
1704
1705	if (brdc) {
1706		if (sat)
1707			emit_sat(pc, brdc, brdc);
1708		for (c = 0; c < 4; c++)
1709			if ((mask & (1 << c)) && dst[c] != brdc)
1710				emit_mov(pc, dst[c], brdc);
1711	} else
1712	if (sat) {
1713		for (c = 0; c < 4; c++) {
1714			if (!(mask & (1 << c)))
1715				continue;
1716			/* in this case we saturate later */
1717			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
1718				continue;
1719			emit_sat(pc, rdst[c], dst[c]);
1720		}
1721	}
1722
1723	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1724		for (c = 0; c < 4; c++) {
1725			if (!src[i][c])
1726				continue;
1727			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1728				FREE(src[i][c]);
1729		}
1730	}
1731
1732	kill_temp_temp(pc);
1733	return TRUE;
1734}
1735
1736static void
1737prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
1738		  unsigned *r_usage[2])
1739{
1740	const struct tgsi_full_instruction *insn;
1741	const struct tgsi_full_src_register *src;
1742	const struct tgsi_dst_register *dst;
1743
1744	unsigned i, c, k, n, mask, *acc_p;
1745
1746	insn = &tok->FullInstruction;
1747	dst = &insn->FullDstRegisters[0].DstRegister;
1748	mask = dst->WriteMask;
1749
1750	if (!r_usage[0])
1751		r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
1752	if (!r_usage[1])
1753		r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
1754
1755	if (dst->File == TGSI_FILE_TEMPORARY) {
1756		for (c = 0; c < 4; c++) {
1757			if (!(mask & (1 << c)))
1758				continue;
1759			r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
1760		}
1761	}
1762
1763	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1764		src = &insn->FullSrcRegisters[i];
1765
1766		switch (src->SrcRegister.File) {
1767		case TGSI_FILE_TEMPORARY:
1768			acc_p = r_usage[0];
1769			break;
1770		case TGSI_FILE_INPUT:
1771			acc_p = r_usage[1];
1772			break;
1773		default:
1774			continue;
1775		}
1776
1777		mask = nv50_tgsi_src_mask(insn, i);
1778
1779		for (c = 0; c < 4; c++) {
1780			if (!(mask & (1 << c)))
1781				continue;
1782
1783			k = tgsi_util_get_full_src_register_extswizzle(src, c);
1784			switch (k) {
1785			case TGSI_EXTSWIZZLE_X:
1786			case TGSI_EXTSWIZZLE_Y:
1787			case TGSI_EXTSWIZZLE_Z:
1788			case TGSI_EXTSWIZZLE_W:
1789				n = src->SrcRegister.Index * 4 + k;
1790				acc_p[n] = pc->insn_nr;
1791				break;
1792			default:
1793				break;
1794			}
1795		}
1796	}
1797}
1798
/* Find an order in which to write the dst components so that as few
 * sources as possible are overwritten before they are read.
 *
 * m[i]    (out) component to write in the i-th position
 * rdep[c] (in)  bitmask of dst components that require dst[c] as source
 *
 * Returns a bitmask of positions (indices into m, NOT component numbers)
 * whose component is still needed as a source by a later write; those
 * have to be written to temporaries first.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* unsafe is accumulated with |= below and must start out empty */
	unsigned i, c, x, unsafe = 0;

	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart; note that the loop increment makes the scan
		 * continue at position 1, not 0
		 */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
1843
1844/* Select a suitable dst register for broadcasting scalar results,
1845 * or return NULL if we have to allocate an extra TEMP.
1846 *
1847 * If e.g. only 1 component is written, we may also emit the final
1848 * result to a write-only register.
1849 */
1850static struct nv50_reg *
1851tgsi_broadcast_dst(struct nv50_pc *pc,
1852		   const struct tgsi_full_dst_register *fd, unsigned mask)
1853{
1854	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
1855		int c = ffs(~mask & fd->DstRegister.WriteMask);
1856		if (c)
1857			return tgsi_dst(pc, c - 1, fd);
1858	} else {
1859		int c = ffs(fd->DstRegister.WriteMask) - 1;
1860		if ((1 << c) == fd->DstRegister.WriteMask)
1861			return tgsi_dst(pc, c, fd);
1862	}
1863
1864	return NULL;
1865}
1866
1867/* Scan source swizzles and return a bitmask indicating dst regs that
1868 * also occur among the src regs, and fill rdep for nv50_revdep_reoder.
1869 */
1870static unsigned
1871nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
1872		       unsigned rdep[4])
1873{
1874	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
1875	const struct tgsi_full_src_register *fs;
1876	unsigned i, deqs = 0;
1877
1878	for (i = 0; i < 4; ++i)
1879		rdep[i] = 0;
1880
1881	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
1882		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
1883		boolean neg_supp = negate_supported(insn, i);
1884
1885		fs = &insn->FullSrcRegisters[i];
1886		if (fs->SrcRegister.File != fd->DstRegister.File ||
1887		    fs->SrcRegister.Index != fd->DstRegister.Index)
1888			continue;
1889
1890		for (chn = 0; chn < 4; ++chn) {
1891			unsigned s, c;
1892
1893			if (!(mask & (1 << chn))) /* src is not read */
1894				continue;
1895			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
1896			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
1897
1898			if (c > TGSI_EXTSWIZZLE_W ||
1899			    !(fd->DstRegister.WriteMask & (1 << c)))
1900				continue;
1901
1902			/* no danger if src is copied to TEMP first */
1903			if ((s != TGSI_UTIL_SIGN_KEEP) &&
1904			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
1905				continue;
1906
1907			rdep[c] |= nv50_tgsi_dst_revdep(
1908				insn->Instruction.Opcode, i, chn);
1909			deqs |= (1 << c);
1910		}
1911	}
1912
1913	return deqs;
1914}
1915
1916static boolean
1917nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
1918{
1919	struct tgsi_full_instruction insn = tok->FullInstruction;
1920	const struct tgsi_full_dst_register *fd;
1921	unsigned i, deqs, rdep[4], m[4];
1922
1923	fd = &tok->FullInstruction.FullDstRegisters[0];
1924	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
1925
1926	if (is_scalar_op(insn.Instruction.Opcode)) {
1927		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
1928		if (!pc->r_brdc)
1929			pc->r_brdc = temp_temp(pc);
1930		return nv50_program_tx_insn(pc, &insn);
1931	}
1932	pc->r_brdc = NULL;
1933
1934	if (!deqs)
1935		return nv50_program_tx_insn(pc, &insn);
1936
1937	deqs = nv50_revdep_reorder(m, rdep);
1938
1939	for (i = 0; i < 4; ++i) {
1940		assert(pc->r_dst[m[i]] == NULL);
1941
1942		insn.FullDstRegisters[0].DstRegister.WriteMask =
1943			fd->DstRegister.WriteMask & (1 << m[i]);
1944
1945		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
1946			continue;
1947
1948		if (deqs & (1 << i))
1949			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
1950
1951		if (!nv50_program_tx_insn(pc, &insn))
1952			return FALSE;
1953	}
1954
1955	for (i = 0; i < 4; i++) {
1956		struct nv50_reg *reg = pc->r_dst[i];
1957		if (!reg)
1958			continue;
1959		pc->r_dst[i] = NULL;
1960
1961		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
1962			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
1963		else
1964			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
1965		free_temp(pc, reg);
1966	}
1967
1968	return TRUE;
1969}
1970
1971static unsigned
1972load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
1973	       int *aid, int *p_oid)
1974{
1975	struct nv50_reg *iv;
1976	int oid, c, n;
1977	unsigned mask = 0;
1978
1979	iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
1980
1981	for (c = 0, n = i * 4; c < 4; c++, n++) {
1982		oid = (*p_oid)++;
1983		pc->attr[n].type = P_TEMP;
1984		pc->attr[n].index = i;
1985
1986		if (pc->attr[n].acc == acc[n])
1987			continue;
1988		mask |= (1 << c);
1989
1990		pc->attr[n].acc = acc[n];
1991		pc->attr[n].rhw = pc->attr[n].hw = -1;
1992		alloc_reg(pc, &pc->attr[n]);
1993
1994		pc->attr[n].rhw = (*aid)++;
1995		emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
1996
1997		pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
1998		(*mid)++;
1999		pc->p->cfg.fp.regs[1] += 0x00010001;
2000	}
2001
2002	return mask;
2003}
2004
2005static boolean
2006nv50_program_tx_prep(struct nv50_pc *pc)
2007{
2008	struct tgsi_parse_context p;
2009	boolean ret = FALSE;
2010	unsigned i, c;
2011	unsigned fcol, bcol, fcrd, depr;
2012
2013	/* count (centroid) perspective interpolations */
2014	unsigned centroid_loads = 0;
2015	unsigned perspect_loads = 0;
2016
2017	/* track register access for temps and attrs */
2018	unsigned *r_usage[2];
2019	r_usage[0] = NULL;
2020	r_usage[1] = NULL;
2021
2022	depr = fcol = bcol = fcrd = 0xffff;
2023
2024	if (pc->p->type == PIPE_SHADER_FRAGMENT) {
2025		pc->p->cfg.fp.regs[0] = 0x01000404;
2026		pc->p->cfg.fp.regs[1] = 0x00000400;
2027	}
2028
2029	tgsi_parse_init(&p, pc->p->pipe.tokens);
2030	while (!tgsi_parse_end_of_tokens(&p)) {
2031		const union tgsi_full_token *tok = &p.FullToken;
2032
2033		tgsi_parse_token(&p);
2034		switch (tok->Token.Type) {
2035		case TGSI_TOKEN_TYPE_IMMEDIATE:
2036		{
2037			const struct tgsi_full_immediate *imm =
2038				&p.FullToken.FullImmediate;
2039
2040			ctor_immd(pc, imm->u[0].Float,
2041				      imm->u[1].Float,
2042				      imm->u[2].Float,
2043				      imm->u[3].Float);
2044		}
2045			break;
2046		case TGSI_TOKEN_TYPE_DECLARATION:
2047		{
2048			const struct tgsi_full_declaration *d;
2049			unsigned last, first, mode;
2050
2051			d = &p.FullToken.FullDeclaration;
2052			first = d->DeclarationRange.First;
2053			last = d->DeclarationRange.Last;
2054
2055			switch (d->Declaration.File) {
2056			case TGSI_FILE_TEMPORARY:
2057				if (pc->temp_nr < (last + 1))
2058					pc->temp_nr = last + 1;
2059				break;
2060			case TGSI_FILE_OUTPUT:
2061				if (pc->result_nr < (last + 1))
2062					pc->result_nr = last + 1;
2063
2064				if (!d->Declaration.Semantic)
2065					break;
2066
2067				switch (d->Semantic.SemanticName) {
2068				case TGSI_SEMANTIC_POSITION:
2069					depr = first;
2070					pc->p->cfg.fp.regs[2] |= 0x00000100;
2071					pc->p->cfg.fp.regs[3] |= 0x00000011;
2072					break;
2073				default:
2074					break;
2075				}
2076
2077				break;
2078			case TGSI_FILE_INPUT:
2079			{
2080				if (pc->attr_nr < (last + 1))
2081					pc->attr_nr = last + 1;
2082
2083				if (pc->p->type != PIPE_SHADER_FRAGMENT)
2084					break;
2085
2086				switch (d->Declaration.Interpolate) {
2087				case TGSI_INTERPOLATE_CONSTANT:
2088					mode = INTERP_FLAT;
2089					break;
2090				case TGSI_INTERPOLATE_PERSPECTIVE:
2091					mode = INTERP_PERSPECTIVE;
2092					break;
2093				default:
2094					mode = INTERP_LINEAR;
2095					break;
2096				}
2097
2098				if (d->Declaration.Semantic) {
2099					switch (d->Semantic.SemanticName) {
2100					case TGSI_SEMANTIC_POSITION:
2101						fcrd = first;
2102						break;
2103					case TGSI_SEMANTIC_COLOR:
2104						fcol = first;
2105						mode = INTERP_PERSPECTIVE;
2106						break;
2107					case TGSI_SEMANTIC_BCOLOR:
2108						bcol = first;
2109						mode = INTERP_PERSPECTIVE;
2110						break;
2111					}
2112				}
2113
2114				if (d->Declaration.Centroid) {
2115					mode |= INTERP_CENTROID;
2116					if (mode & INTERP_PERSPECTIVE)
2117						centroid_loads++;
2118				} else
2119				if (mode & INTERP_PERSPECTIVE)
2120					perspect_loads++;
2121
2122				assert(last < 32);
2123				for (i = first; i <= last; i++)
2124					pc->interp_mode[i] = mode;
2125			}
2126				break;
2127			case TGSI_FILE_CONSTANT:
2128				if (pc->param_nr < (last + 1))
2129					pc->param_nr = last + 1;
2130				break;
2131			case TGSI_FILE_SAMPLER:
2132				break;
2133			default:
2134				NOUVEAU_ERR("bad decl file %d\n",
2135					    d->Declaration.File);
2136				goto out_err;
2137			}
2138		}
2139			break;
2140		case TGSI_TOKEN_TYPE_INSTRUCTION:
2141			pc->insn_nr++;
2142			prep_inspect_insn(pc, tok, r_usage);
2143			break;
2144		default:
2145			break;
2146		}
2147	}
2148
2149	if (pc->temp_nr) {
2150		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
2151		if (!pc->temp)
2152			goto out_err;
2153
2154		for (i = 0; i < pc->temp_nr; i++) {
2155			for (c = 0; c < 4; c++) {
2156				pc->temp[i*4+c].type = P_TEMP;
2157				pc->temp[i*4+c].hw = -1;
2158				pc->temp[i*4+c].rhw = -1;
2159				pc->temp[i*4+c].index = i;
2160				pc->temp[i*4+c].acc = r_usage[0][i*4+c];
2161			}
2162		}
2163	}
2164
2165	if (pc->attr_nr) {
2166		int oid = 4, mid = 4, aid = 0;
2167		/* oid = VP output id
2168		 * aid = FP attribute/interpolant id
2169		 * mid = VP output mapping field ID
2170		 */
2171
2172		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
2173		if (!pc->attr)
2174			goto out_err;
2175
2176		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
2177			/* position should be loaded first */
2178			if (fcrd != 0xffff) {
2179				unsigned mask;
2180				mid = 0;
2181				mask = load_fp_attrib(pc, fcrd, r_usage[1],
2182						      &mid, &aid, &oid);
2183				oid = 0;
2184				pc->p->cfg.fp.regs[1] |= (mask << 24);
2185				pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
2186			}
2187			pc->p->cfg.fp.map[0] += 0x03020100;
2188
2189			/* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
2190
2191			if (perspect_loads) {
2192				pc->iv_p = alloc_temp(pc, NULL);
2193
2194				if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
2195					pc->p->cfg.fp.regs[1] |= 0x08000000;
2196					pc->iv_p->rhw = aid++;
2197					emit_interp(pc, pc->iv_p, NULL,
2198						    INTERP_LINEAR);
2199					emit_flop(pc, 0, pc->iv_p, pc->iv_p);
2200				} else {
2201					pc->iv_p->rhw = aid - 1;
2202					emit_flop(pc, 0, pc->iv_p,
2203						  &pc->attr[fcrd * 4 + 3]);
2204				}
2205			}
2206
2207			if (centroid_loads) {
2208				pc->iv_c = alloc_temp(pc, NULL);
2209				pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
2210				emit_interp(pc, pc->iv_c, NULL,
2211					    INTERP_CENTROID);
2212				emit_flop(pc, 0, pc->iv_c, pc->iv_c);
2213				pc->p->cfg.fp.regs[1] |= 0x08000000;
2214			}
2215
2216			for (c = 0; c < 4; c++) {
2217				/* I don't know what these values do, but
2218				 * let's set them like the blob does:
2219				 */
2220				if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
2221					pc->p->cfg.fp.regs[0] += 0x00010000;
2222				if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
2223					pc->p->cfg.fp.regs[0] += 0x00010000;
2224			}
2225
2226			for (i = 0; i < pc->attr_nr; i++)
2227				load_fp_attrib(pc, i, r_usage[1],
2228					       &mid, &aid, &oid);
2229
2230			if (pc->iv_p)
2231				free_temp(pc, pc->iv_p);
2232			if (pc->iv_c)
2233				free_temp(pc, pc->iv_c);
2234
2235			pc->p->cfg.fp.high_map = (mid / 4);
2236			pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
2237		} else {
2238			/* vertex program */
2239			for (i = 0; i < pc->attr_nr * 4; i++) {
2240				pc->p->cfg.vp.attr[aid / 32] |=
2241					(1 << (aid % 32));
2242				pc->attr[i].type = P_ATTR;
2243				pc->attr[i].hw = aid++;
2244				pc->attr[i].index = i / 4;
2245			}
2246		}
2247	}
2248
2249	if (pc->result_nr) {
2250		int rid = 0;
2251
2252		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
2253		if (!pc->result)
2254			goto out_err;
2255
2256		for (i = 0; i < pc->result_nr; i++) {
2257			for (c = 0; c < 4; c++) {
2258				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
2259					pc->result[i*4+c].type = P_TEMP;
2260					pc->result[i*4+c].hw = -1;
2261					pc->result[i*4+c].rhw = (i == depr) ?
2262						-1 : rid++;
2263				} else {
2264					pc->result[i*4+c].type = P_RESULT;
2265					pc->result[i*4+c].hw = rid++;
2266				}
2267				pc->result[i*4+c].index = i;
2268			}
2269
2270			if (pc->p->type == PIPE_SHADER_FRAGMENT &&
2271			    depr != 0xffff) {
2272				pc->result[depr * 4 + 2].rhw =
2273					(pc->result_nr - 1) * 4;
2274			}
2275		}
2276	}
2277
2278	if (pc->param_nr) {
2279		int rid = 0;
2280
2281		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
2282		if (!pc->param)
2283			goto out_err;
2284
2285		for (i = 0; i < pc->param_nr; i++) {
2286			for (c = 0; c < 4; c++) {
2287				pc->param[i*4+c].type = P_CONST;
2288				pc->param[i*4+c].hw = rid++;
2289				pc->param[i*4+c].index = i;
2290			}
2291		}
2292	}
2293
2294	if (pc->immd_nr) {
2295		int rid = 0;
2296
2297		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
2298		if (!pc->immd)
2299			goto out_err;
2300
2301		for (i = 0; i < pc->immd_nr; i++) {
2302			for (c = 0; c < 4; c++) {
2303				pc->immd[i*4+c].type = P_IMMD;
2304				pc->immd[i*4+c].hw = rid++;
2305				pc->immd[i*4+c].index = i;
2306			}
2307		}
2308	}
2309
2310	ret = TRUE;
2311out_err:
2312	if (r_usage[0])
2313		FREE(r_usage[0]);
2314	if (r_usage[1])
2315		FREE(r_usage[1]);
2316
2317	tgsi_parse_free(&p);
2318	return ret;
2319}
2320
2321static void
2322free_nv50_pc(struct nv50_pc *pc)
2323{
2324	if (pc->immd)
2325		FREE(pc->immd);
2326	if (pc->param)
2327		FREE(pc->param);
2328	if (pc->result)
2329		FREE(pc->result);
2330	if (pc->attr)
2331		FREE(pc->attr);
2332	if (pc->temp)
2333		FREE(pc->temp);
2334
2335	FREE(pc);
2336}
2337
2338static boolean
2339nv50_program_tx(struct nv50_program *p)
2340{
2341	struct tgsi_parse_context parse;
2342	struct nv50_pc *pc;
2343	unsigned k;
2344	boolean ret;
2345
2346	pc = CALLOC_STRUCT(nv50_pc);
2347	if (!pc)
2348		return FALSE;
2349	pc->p = p;
2350	pc->p->cfg.high_temp = 4;
2351
2352	ret = nv50_program_tx_prep(pc);
2353	if (ret == FALSE)
2354		goto out_cleanup;
2355
2356	tgsi_parse_init(&parse, pc->p->pipe.tokens);
2357	while (!tgsi_parse_end_of_tokens(&parse)) {
2358		const union tgsi_full_token *tok = &parse.FullToken;
2359
2360		/* don't allow half insn/immd on first and last instruction */
2361		pc->allow32 = TRUE;
2362		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
2363			pc->allow32 = FALSE;
2364
2365		tgsi_parse_token(&parse);
2366
2367		switch (tok->Token.Type) {
2368		case TGSI_TOKEN_TYPE_INSTRUCTION:
2369			++pc->insn_cur;
2370			ret = nv50_tgsi_insn(pc, tok);
2371			if (ret == FALSE)
2372				goto out_err;
2373			break;
2374		default:
2375			break;
2376		}
2377	}
2378
2379	if (p->type == PIPE_SHADER_FRAGMENT) {
2380		struct nv50_reg out;
2381
2382		out.type = P_TEMP;
2383		for (k = 0; k < pc->result_nr * 4; k++) {
2384			if (pc->result[k].rhw == -1)
2385				continue;
2386			if (pc->result[k].hw != pc->result[k].rhw) {
2387				out.hw = pc->result[k].rhw;
2388				emit_mov(pc, &out, &pc->result[k]);
2389			}
2390			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
2391				pc->p->cfg.high_result = pc->result[k].rhw + 1;
2392		}
2393	}
2394
2395	/* look for single half instructions and make them long */
2396	struct nv50_program_exec *e, *e_prev;
2397
2398	for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
2399		if (!is_long(e))
2400			k++;
2401
2402		if (!e->next || is_long(e->next)) {
2403			if (k & 1)
2404				convert_to_long(pc, e);
2405			k = 0;
2406		}
2407
2408		if (e->next)
2409			e_prev = e;
2410	}
2411
2412	if (!is_long(pc->p->exec_tail)) {
2413		/* this may occur if moving FP results */
2414		assert(e_prev && !is_long(e_prev));
2415		convert_to_long(pc, e_prev);
2416		convert_to_long(pc, pc->p->exec_tail);
2417	}
2418
2419	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
2420	pc->p->exec_tail->inst[1] |= 0x00000001;
2421
2422	p->param_nr = pc->param_nr * 4;
2423	p->immd_nr = pc->immd_nr * 4;
2424	p->immd = pc->immd_buf;
2425
2426out_err:
2427	tgsi_parse_free(&parse);
2428
2429out_cleanup:
2430	free_nv50_pc(pc);
2431	return ret;
2432}
2433
2434static void
2435nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
2436{
2437	if (nv50_program_tx(p) == FALSE)
2438		assert(0);
2439	p->translated = TRUE;
2440}
2441
2442static void
2443nv50_program_upload_data(struct nv50_context *nv50, float *map,
2444			unsigned start, unsigned count, unsigned cbuf)
2445{
2446	struct nouveau_channel *chan = nv50->screen->base.channel;
2447	struct nouveau_grobj *tesla = nv50->screen->tesla;
2448
2449	while (count) {
2450		unsigned nr = count > 2047 ? 2047 : count;
2451
2452		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2453		OUT_RING  (chan, (cbuf << 0) | (start << 8));
2454		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2455		OUT_RINGp (chan, map, nr);
2456
2457		map += nr;
2458		start += nr;
2459		count -= nr;
2460	}
2461}
2462
2463static void
2464nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
2465{
2466	struct pipe_screen *pscreen = nv50->pipe.screen;
2467
2468	if (!p->data[0] && p->immd_nr) {
2469		struct nouveau_resource *heap = nv50->screen->immd_heap[0];
2470
2471		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
2472			while (heap->next && heap->size < p->immd_nr) {
2473				struct nv50_program *evict = heap->next->priv;
2474				nouveau_resource_free(&evict->data[0]);
2475			}
2476
2477			if (nouveau_resource_alloc(heap, p->immd_nr, p,
2478						   &p->data[0]))
2479				assert(0);
2480		}
2481
2482		/* immediates only need to be uploaded again when freed */
2483		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
2484					 p->immd_nr, NV50_CB_PMISC);
2485	}
2486
2487	if (!p->data[1] && p->param_nr) {
2488		struct nouveau_resource *heap =
2489			nv50->screen->parm_heap[p->type];
2490
2491		if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
2492			while (heap->next && heap->size < p->param_nr) {
2493				struct nv50_program *evict = heap->next->priv;
2494				nouveau_resource_free(&evict->data[1]);
2495			}
2496
2497			if (nouveau_resource_alloc(heap, p->param_nr, p,
2498						   &p->data[1]))
2499				assert(0);
2500		}
2501	}
2502
2503	if (p->param_nr) {
2504		unsigned cbuf = NV50_CB_PVP;
2505		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
2506					     PIPE_BUFFER_USAGE_CPU_READ);
2507		if (p->type == PIPE_SHADER_FRAGMENT)
2508			cbuf = NV50_CB_PFP;
2509		nv50_program_upload_data(nv50, map, p->data[1]->start,
2510					 p->param_nr, cbuf);
2511		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
2512	}
2513}
2514
2515static void
2516nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
2517{
2518	struct nouveau_channel *chan = nv50->screen->base.channel;
2519	struct nouveau_grobj *tesla = nv50->screen->tesla;
2520	struct nv50_program_exec *e;
2521	struct nouveau_stateobj *so;
2522	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
2523	unsigned start, count, *up, *ptr;
2524	boolean upload = FALSE;
2525
2526	if (!p->bo) {
2527		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
2528			       p->exec_size * 4, &p->bo);
2529		upload = TRUE;
2530	}
2531
2532	if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
2533		(p->data[1] && p->data[1]->start != p->data_start[1])) {
2534		for (e = p->exec_head; e; e = e->next) {
2535			unsigned ei, ci, bs;
2536
2537			if (e->param.index < 0)
2538				continue;
2539			bs = (e->inst[1] >> 22) & 0x07;
2540			assert(bs < 2);
2541			ei = e->param.shift >> 5;
2542			ci = e->param.index + p->data[bs]->start;
2543
2544			e->inst[ei] &= ~e->param.mask;
2545			e->inst[ei] |= (ci << e->param.shift);
2546		}
2547
2548		if (p->data[0])
2549			p->data_start[0] = p->data[0]->start;
2550		if (p->data[1])
2551			p->data_start[1] = p->data[1]->start;
2552
2553		upload = TRUE;
2554	}
2555
2556	if (!upload)
2557		return;
2558
2559#ifdef NV50_PROGRAM_DUMP
2560	NOUVEAU_ERR("-------\n");
2561	for (e = p->exec_head; e; e = e->next) {
2562		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
2563		if (is_long(e))
2564			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
2565	}
2566#endif
2567
2568	up = ptr = MALLOC(p->exec_size * 4);
2569	for (e = p->exec_head; e; e = e->next) {
2570		*(ptr++) = e->inst[0];
2571		if (is_long(e))
2572			*(ptr++) = e->inst[1];
2573	}
2574
2575	so = so_new(4,2);
2576	so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
2577	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
2578	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
2579	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
2580
2581	start = 0; count = p->exec_size;
2582	while (count) {
2583		struct nouveau_channel *chan = nv50->screen->base.channel;
2584		unsigned nr;
2585
2586		so_emit(chan, so);
2587
2588		nr = MIN2(count, 2047);
2589		nr = MIN2(chan->pushbuf->remaining, nr);
2590		if (chan->pushbuf->remaining < (nr + 3)) {
2591			FIRE_RING(chan);
2592			continue;
2593		}
2594
2595		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
2596		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
2597		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
2598		OUT_RINGp (chan, up + start, nr);
2599
2600		start += nr;
2601		count -= nr;
2602	}
2603
2604	FREE(up);
2605	so_ref(NULL, &so);
2606}
2607
2608void
2609nv50_vertprog_validate(struct nv50_context *nv50)
2610{
2611	struct nouveau_grobj *tesla = nv50->screen->tesla;
2612	struct nv50_program *p = nv50->vertprog;
2613	struct nouveau_stateobj *so;
2614
2615	if (!p->translated) {
2616		nv50_program_validate(nv50, p);
2617		if (!p->translated)
2618			assert(0);
2619	}
2620
2621	nv50_program_validate_data(nv50, p);
2622	nv50_program_validate_code(nv50, p);
2623
2624	so = so_new(13, 2);
2625	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
2626	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2627		      NOUVEAU_BO_HIGH, 0, 0);
2628	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2629		      NOUVEAU_BO_LOW, 0, 0);
2630	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
2631	so_data  (so, p->cfg.vp.attr[0]);
2632	so_data  (so, p->cfg.vp.attr[1]);
2633	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
2634	so_data  (so, p->cfg.high_result);
2635	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
2636	so_data  (so, p->cfg.high_result); //8);
2637	so_data  (so, p->cfg.high_temp);
2638	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
2639	so_data  (so, 0); /* program start offset */
2640	so_ref(so, &nv50->state.vertprog);
2641	so_ref(NULL, &so);
2642}
2643
2644void
2645nv50_fragprog_validate(struct nv50_context *nv50)
2646{
2647	struct nouveau_grobj *tesla = nv50->screen->tesla;
2648	struct nv50_program *p = nv50->fragprog;
2649	struct nouveau_stateobj *so;
2650	unsigned i;
2651
2652	if (!p->translated) {
2653		nv50_program_validate(nv50, p);
2654		if (!p->translated)
2655			assert(0);
2656	}
2657
2658	nv50_program_validate_data(nv50, p);
2659	nv50_program_validate_code(nv50, p);
2660
2661	so = so_new(64, 2);
2662	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
2663	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2664		      NOUVEAU_BO_HIGH, 0, 0);
2665	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
2666		      NOUVEAU_BO_LOW, 0, 0);
2667	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
2668	so_data  (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
2669	so_data  (so, 0x00000004);
2670	so_data  (so, 0x00000000);
2671	so_data  (so, 0x00000000);
2672	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map);
2673	for (i = 0; i < p->cfg.fp.high_map; i++)
2674		so_data(so, p->cfg.fp.map[i]);
2675	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2);
2676	so_data  (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
2677	so_data  (so, p->cfg.high_temp);
2678	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
2679	so_data  (so, p->cfg.high_result);
2680	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
2681	so_data  (so, p->cfg.fp.regs[2]);
2682	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
2683	so_data  (so, p->cfg.fp.regs[3]);
2684	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
2685	so_data  (so, 0); /* program start offset */
2686	so_ref(so, &nv50->state.fragprog);
2687	so_ref(NULL, &so);
2688}
2689
2690void
2691nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
2692{
2693	while (p->exec_head) {
2694		struct nv50_program_exec *e = p->exec_head;
2695
2696		p->exec_head = e->next;
2697		FREE(e);
2698	}
2699	p->exec_tail = NULL;
2700	p->exec_size = 0;
2701
2702	nouveau_bo_ref(NULL, &p->bo);
2703
2704	nouveau_resource_free(&p->data[0]);
2705	nouveau_resource_free(&p->data[1]);
2706
2707	p->translated = 0;
2708}
2709