/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_strings.h"
#include "tgsi/tgsi_dump.h"

#include "fd2_compiler.h"
#include "fd2_program.h"
#include "fd2_util.h"

#include "instr-a2xx.h"
#include "ir-a2xx.h"

struct fd2_compile_context {
	struct fd_program_stateobj *prog;
	struct fd2_shader_stateobj *so;

	struct tgsi_parse_context parser;
	unsigned type;

	/* predicate stack: */
	int pred_depth;
	enum ir2_pred pred_stack[8];

	/* Internal-Temporary and Predicate register assignment:
	 *
	 * Some TGSI instructions which translate into multiple actual
	 * instructions need one or more temporary registers, which are not
	 * assigned from TGSI perspective (ie. not TGSI_FILE_TEMPORARY).
	 * And some instructions (texture fetch) cannot write directly to
	 * output registers.  We could be more clever and re-use dst or a
	 * src register in some cases.  But for now don't try to be clever.
	 * Eventually we should implement an optimization pass that re-
	 * juggles the register usage and gets rid of unneeded temporaries.
	 *
	 * The predicate register must remain valid across multiple TGSI
	 * instructions, but internal temporaries only need to be valid
	 * within a single one.  For this reason, once the predicate
	 * register is requested, until it is no longer needed, it gets the
	 * first register slot after the TGSI assigned temporaries (ie.
	 * num_regs[TGSI_FILE_TEMPORARY]), and the internal temporaries get
	 * the register slots above this.
	 */

	int pred_reg;
	int num_internal_temps;

	uint8_t num_regs[TGSI_FILE_COUNT];

	/* maps input register idx to prog->export_linkage idx: */
	uint8_t input_export_idx[64];

	/* maps output register idx to prog->export_linkage idx: */
	uint8_t output_export_idx[64];

	/* idx/slot for last compiler generated immediate */
	unsigned immediate_idx;

	// TODO we could skip emitting exports in the VS that the FS doesn't need..
	// and perhaps get rid of num_param..
	unsigned num_position, num_param;
	unsigned position, psize;

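	/* bitmask of GPRs written by (vertex/texture) fetch instructions;
	 * the first ALU instruction to read one of them gets its sync bit
	 * set (see add_src_reg()):
	 */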
	uint64_t need_sync;

	/* current exec CF instruction */
	struct ir2_cf *cf;
};

static int
semantic_idx(struct tgsi_declaration_semantic *semantic)
{
	int idx = semantic->Name;
	if (idx == TGSI_SEMANTIC_GENERIC)
		idx = TGSI_SEMANTIC_COUNT + semantic->Index;
	return idx;
}

/* assign/get the input/export register # for given semantic idx as
 * returned by semantic_idx():
 */
static int
export_linkage(struct fd2_compile_context *ctx, int idx)
{
	struct fd_program_stateobj *prog = ctx->prog;

	/* if first time we've seen this export, assign the next available slot: */
	if (prog->export_linkage[idx] == 0xff)
		prog->export_linkage[idx] = prog->num_exports++;

	return prog->export_linkage[idx];
}

static unsigned
compile_init(struct fd2_compile_context *ctx, struct fd_program_stateobj *prog,
		struct fd2_shader_stateobj *so)
{
	unsigned ret;

	ctx->prog = prog;
	ctx->so = so;
	ctx->cf = NULL;
	ctx->pred_depth = 0;

	ret = tgsi_parse_init(&ctx->parser, so->tokens);
	if (ret != TGSI_PARSE_OK)
		return ret;

	ctx->type = ctx->parser.FullHeader.Processor.Processor;
	ctx->position = ~0;
	ctx->psize = ~0;
	ctx->num_position = 0;
	ctx->num_param = 0;
	ctx->need_sync = 0;
	ctx->immediate_idx = 0;
	ctx->pred_reg = -1;
	ctx->num_internal_temps = 0;

	memset(ctx->num_regs, 0, sizeof(ctx->num_regs));
	memset(ctx->input_export_idx, 0, sizeof(ctx->input_export_idx));
	memset(ctx->output_export_idx, 0, sizeof(ctx->output_export_idx));

	/* do first pass to extract declarations: */
	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
		tgsi_parse_token(&ctx->parser);

		switch (ctx->parser.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_DECLARATION: {
			struct tgsi_full_declaration *decl =
					&ctx->parser.FullToken.FullDeclaration;
			if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
				unsigned name = decl->Semantic.Name;

				assert(decl->Declaration.Semantic);  // TODO is this ever not true?

				ctx->output_export_idx[decl->Range.First] =
						semantic_idx(&decl->Semantic);

				if (ctx->type == PIPE_SHADER_VERTEX) {
					switch (name) {
					case TGSI_SEMANTIC_POSITION:
						ctx->position = ctx->num_regs[TGSI_FILE_OUTPUT];
						ctx->num_position++;
						break;
					case TGSI_SEMANTIC_PSIZE:
						ctx->psize = ctx->num_regs[TGSI_FILE_OUTPUT];
						ctx->num_position++;
						break;
					case TGSI_SEMANTIC_COLOR:
					case TGSI_SEMANTIC_GENERIC:
						ctx->num_param++;
						break;
					default:
						DBG("unknown VS semantic name: %s",
								tgsi_semantic_names[name]);
						assert(0);
					}
				} else {
					switch (name) {
					case TGSI_SEMANTIC_COLOR:
					case TGSI_SEMANTIC_GENERIC:
						ctx->num_param++;
						break;
					default:
						DBG("unknown PS semantic name: %s",
								tgsi_semantic_names[name]);
						assert(0);
					}
				}
			} else if (decl->Declaration.File == TGSI_FILE_INPUT) {
				ctx->input_export_idx[decl->Range.First] =
						semantic_idx(&decl->Semantic);
			}
			ctx->num_regs[decl->Declaration.File] =
					MAX2(ctx->num_regs[decl->Declaration.File], decl->Range.Last + 1);
			break;
		}
		case TGSI_TOKEN_TYPE_IMMEDIATE: {
			struct tgsi_full_immediate *imm =
					&ctx->parser.FullToken.FullImmediate;
			unsigned n = ctx->so->num_immediates++;
			memcpy(ctx->so->immediates[n].val, imm->u, 16);
			break;
		}
		default:
			break;
		}
	}

	/* TGSI generated immediates are always entire vec4's, whereas the
	 * ones we generate internally are not:
	 */
	ctx->immediate_idx = ctx->so->num_immediates * 4;

	ctx->so->first_immediate = ctx->num_regs[TGSI_FILE_CONSTANT];

	tgsi_parse_free(&ctx->parser);

	return tgsi_parse_init(&ctx->parser, so->tokens);
}

static void
compile_free(struct fd2_compile_context *ctx)
{
	tgsi_parse_free(&ctx->parser);
}

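/* Return the current EXEC CF block, starting a new one if there is
 * none yet or the current one is already full:
 */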
static struct ir2_cf *
next_exec_cf(struct fd2_compile_context *ctx)
{
	struct ir2_cf *cf = ctx->cf;
	if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs))
		ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC);
	return cf;
}

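/* Emit one VTX_FETCH per declared input: input i lands in R(i+1)
 * (R0 presumably carries the vertex index that the fetches read
 * from), and each destination register is flagged in need_sync so
 * the first ALU consumer waits for the fetched data:
 */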
static void
compile_vtx_fetch(struct fd2_compile_context *ctx)
{
	struct ir2_instruction **vfetch_instrs = ctx->so->vfetch_instrs;
	int i;
	for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) {
		struct ir2_instruction *instr = ir2_instr_create(
				next_exec_cf(ctx), IR2_FETCH);
		instr->fetch.opc = VTX_FETCH;

		ctx->need_sync |= 1 << (i+1);

		ir2_reg_create(instr, i+1, "xyzw", 0);
		ir2_reg_create(instr, 0, "x", 0);

		if (i == 0)
			instr->sync = true;

		vfetch_instrs[i] = instr;
	}
	ctx->so->num_vfetch_instrs = i;
	ctx->cf = NULL;
}

/*
 * For vertex shaders (VS):
 * --- ------ -------------
 *
 *   Inputs:     R1-R(num_input)
 *   Constants:  C0-C(num_const-1)
 *   Immediates: C(num_const)-C(num_const+num_imm-1)
 *   Outputs:    export0-export(n) and export62, export63
 *      n is # of outputs minus gl_Position (export62) and gl_PointSize (export63)
 *   Temps:      R(num_input+1)-R(num_input+num_temps)
 *
 * R0 could be clobbered after the vertex fetch instructions.. so we
 * could use it for one of the temporaries.
 *
 * TODO: maybe the vertex fetch part could fetch first input into R0 as
 * the last vtx fetch instruction, which would let us use the same
 * register layout in either case.. although this is not what the blob
 * compiler does.
 *
 *
 * For frag shaders (PS):
 * --- ---- -------------
 *
 *   Inputs:     R0-R(num_input-1)
 *   Constants:  same as VS
 *   Immediates: same as VS
 *   Outputs:    export0-export(num_outputs)
 *   Temps:      R(num_input)-R(num_input+num_temps-1)
 *
 * In either case, immediates are appended to the constants
 * (uniforms).
 *
 */
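
/* For example (hypothetical counts), a VS with 3 inputs, 2 uniform
 * constants, 1 immediate and 2 temporaries would end up with:
 *
 *   Inputs:     R1-R3
 *   Constants:  C0-C1
 *   Immediates: C2
 *   Temps:      R4-R5
 *
 * plus whichever exports its outputs map to.
 */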

static unsigned
get_temp_gpr(struct fd2_compile_context *ctx, int idx)
{
	unsigned num = idx + ctx->num_regs[TGSI_FILE_INPUT];
	if (ctx->type == PIPE_SHADER_VERTEX)
		num++;
	return num;
}

static struct ir2_register *
add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
		const struct tgsi_dst_register *dst)
{
	unsigned flags = 0, num = 0;
	char swiz[5];

	switch (dst->File) {
	case TGSI_FILE_OUTPUT:
		flags |= IR2_REG_EXPORT;
		if (ctx->type == PIPE_SHADER_VERTEX) {
			if (dst->Index == ctx->position) {
				num = 62;
			} else if (dst->Index == ctx->psize) {
				num = 63;
			} else {
				num = export_linkage(ctx,
						ctx->output_export_idx[dst->Index]);
			}
		} else {
			num = dst->Index;
		}
		break;
	case TGSI_FILE_TEMPORARY:
		num = get_temp_gpr(ctx, dst->Index);
		break;
	default:
		DBG("unsupported dst register file: %s",
			tgsi_file_name(dst->File));
		assert(0);
		break;
	}

	swiz[0] = (dst->WriteMask & TGSI_WRITEMASK_X) ? 'x' : '_';
	swiz[1] = (dst->WriteMask & TGSI_WRITEMASK_Y) ? 'y' : '_';
	swiz[2] = (dst->WriteMask & TGSI_WRITEMASK_Z) ? 'z' : '_';
	swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_';
	swiz[4] = '\0';

	return ir2_reg_create(alu, num, swiz, flags);
}

static struct ir2_register *
add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
		const struct tgsi_src_register *src)
{
	static const char swiz_vals[] = {
			'x', 'y', 'z', 'w',
	};
	char swiz[5];
	unsigned flags = 0, num = 0;

	switch (src->File) {
	case TGSI_FILE_CONSTANT:
		num = src->Index;
		flags |= IR2_REG_CONST;
		break;
	case TGSI_FILE_INPUT:
		if (ctx->type == PIPE_SHADER_VERTEX) {
			num = src->Index + 1;
		} else {
			num = export_linkage(ctx,
					ctx->input_export_idx[src->Index]);
		}
		break;
	case TGSI_FILE_TEMPORARY:
		num = get_temp_gpr(ctx, src->Index);
		break;
	case TGSI_FILE_IMMEDIATE:
		num = src->Index + ctx->num_regs[TGSI_FILE_CONSTANT];
		flags |= IR2_REG_CONST;
		break;
	default:
		DBG("unsupported src register file: %s",
			tgsi_file_name(src->File));
		assert(0);
		break;
	}

	if (src->Absolute)
		flags |= IR2_REG_ABS;
	if (src->Negate)
		flags |= IR2_REG_NEGATE;

	swiz[0] = swiz_vals[src->SwizzleX];
	swiz[1] = swiz_vals[src->SwizzleY];
	swiz[2] = swiz_vals[src->SwizzleZ];
	swiz[3] = swiz_vals[src->SwizzleW];
	swiz[4] = '\0';

	if ((ctx->need_sync & ((uint64_t)1 << num)) &&
			!(flags & IR2_REG_CONST)) {
		alu->sync = true;
		ctx->need_sync &= ~((uint64_t)1 << num);
	}

	return ir2_reg_create(alu, num, swiz, flags);
}

static void
add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	if (inst->Instruction.Saturate) {
		alu->alu.vector_clamp = true;
	}
}

static void
add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	if (inst->Instruction.Saturate) {
		alu->alu.scalar_clamp = true;
	}
}

static void
add_regs_vector_1(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 1);
	assert(inst->Instruction.NumDstRegs == 1);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_vector_clamp(inst, alu);
}

static void
add_regs_vector_2(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 2);
	assert(inst->Instruction.NumDstRegs == 1);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_src_reg(ctx, alu, &inst->Src[1].Register);
	add_vector_clamp(inst, alu);
}

static void
add_regs_vector_3(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 3);
	assert(inst->Instruction.NumDstRegs == 1);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	/* maybe should re-arrange the syntax some day, but what the
	 * assembler/disassembler and ir.c expect is:
	 *   MULADDv Rdst = Rsrc2 + Rsrc0 * Rsrc1
	 */
	add_src_reg(ctx, alu, &inst->Src[2].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_src_reg(ctx, alu, &inst->Src[1].Register);
	add_vector_clamp(inst, alu);
}

static void
add_regs_dummy_vector(struct ir2_instruction *alu)
{
	/* create dummy, non-written vector dst/src regs
	 * for unused vector instr slot:
	 */
	ir2_reg_create(alu, 0, "____", 0); /* vector dst */
	ir2_reg_create(alu, 0, NULL, 0);   /* vector src1 */
	ir2_reg_create(alu, 0, NULL, 0);   /* vector src2 */
}

static void
add_regs_scalar_1(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 1);
	assert(inst->Instruction.NumDstRegs == 1);

	add_regs_dummy_vector(alu);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_scalar_clamp(inst, alu);
}

/*
 * Helpers for TGSI instructions that don't map to a single shader instr:
 */

static void
src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
{
	src->File      = dst->File;
	src->Indirect  = dst->Indirect;
	src->Dimension = dst->Dimension;
	src->Index     = dst->Index;
	src->Absolute  = 0;
	src->Negate    = 0;
	src->SwizzleX  = TGSI_SWIZZLE_X;
	src->SwizzleY  = TGSI_SWIZZLE_Y;
	src->SwizzleZ  = TGSI_SWIZZLE_Z;
	src->SwizzleW  = TGSI_SWIZZLE_W;
}

/* Get internal-temp src/dst to use for a sequence of instructions
 * generated by a single TGSI op.
 */
static void
get_internal_temp(struct fd2_compile_context *ctx,
		struct tgsi_dst_register *tmp_dst,
		struct tgsi_src_register *tmp_src)
{
	int n;

	tmp_dst->File      = TGSI_FILE_TEMPORARY;
	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
	tmp_dst->Indirect  = 0;
	tmp_dst->Dimension = 0;

	/* assign next temporary: */
	n = ctx->num_internal_temps++;
	if (ctx->pred_reg != -1)
		n++;

	tmp_dst->Index = ctx->num_regs[TGSI_FILE_TEMPORARY] + n;

	src_from_dst(tmp_src, tmp_dst);
}

static void
get_predicate(struct fd2_compile_context *ctx, struct tgsi_dst_register *dst,
		struct tgsi_src_register *src)
{
	assert(ctx->pred_reg != -1);

	dst->File      = TGSI_FILE_TEMPORARY;
	dst->WriteMask = TGSI_WRITEMASK_W;
	dst->Indirect  = 0;
	dst->Dimension = 0;
	dst->Index     = get_temp_gpr(ctx, ctx->pred_reg);

	if (src) {
		src_from_dst(src, dst);
		src->SwizzleX  = TGSI_SWIZZLE_W;
		src->SwizzleY  = TGSI_SWIZZLE_W;
		src->SwizzleZ  = TGSI_SWIZZLE_W;
		src->SwizzleW  = TGSI_SWIZZLE_W;
	}
}

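/* push_predicate()/pop_predicate() maintain the stack of nested
 * conditions: the outermost IF seeds the predicate register with
 * PRED_SETNEs, each nested IF folds its condition into the current
 * predicate value with a MULv (effectively ANDing 0.0/1.0 values),
 * and pop_predicate() restores the previous state, releasing the
 * predicate register when the outermost level is popped:
 */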
static void
push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
{
	struct ir2_instruction *alu;
	struct tgsi_dst_register pred_dst;

	/* NOTE the blob compiler seems to always put PRED_* instrs in a CF by
	 * themselves:
	 */
	ctx->cf = NULL;

	if (ctx->pred_depth == 0) {
		/* assign predicate register: */
		ctx->pred_reg = ctx->num_regs[TGSI_FILE_TEMPORARY];

		get_predicate(ctx, &pred_dst, NULL);

		alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SETNEs);
		add_regs_dummy_vector(alu);
		add_dst_reg(ctx, alu, &pred_dst);
		add_src_reg(ctx, alu, src);
	} else {
		struct tgsi_src_register pred_src;

		get_predicate(ctx, &pred_dst, &pred_src);

		alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
		add_dst_reg(ctx, alu, &pred_dst);
		add_src_reg(ctx, alu, &pred_src);
		add_src_reg(ctx, alu, src);

		// XXX need to make PRED_SETE_PUSHv IR2_PRED_NONE.. but need to make
		// sure src reg is valid if it was calculated with a predicate
		// condition..
		alu->pred = IR2_PRED_NONE;
	}

	/* save previous pred state to restore in pop_predicate(): */
	ctx->pred_stack[ctx->pred_depth++] = ctx->so->ir->pred;

	ctx->cf = NULL;
}

static void
pop_predicate(struct fd2_compile_context *ctx)
{
	/* NOTE the blob compiler seems to always put PRED_* instrs in a CF by
	 * themselves:
	 */
	ctx->cf = NULL;

	/* restore previous predicate state: */
	ctx->so->ir->pred = ctx->pred_stack[--ctx->pred_depth];

	if (ctx->pred_depth != 0) {
		struct ir2_instruction *alu;
		struct tgsi_dst_register pred_dst;
		struct tgsi_src_register pred_src;

		get_predicate(ctx, &pred_dst, &pred_src);

		alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SET_POPs);
		add_regs_dummy_vector(alu);
		add_dst_reg(ctx, alu, &pred_dst);
		add_src_reg(ctx, alu, &pred_src);
		alu->pred = IR2_PRED_NONE;
	} else {
		/* predicate register no longer needed: */
		ctx->pred_reg = -1;
	}

	ctx->cf = NULL;
}

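/* Find (or allocate) a constant slot holding 'val'.  Compiler
 * generated immediates are packed a scalar at a time into vec4
 * immediate slots; an existing component is reused if it already
 * holds val, or -val (in which case the src is negated instead):
 */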
static void
get_immediate(struct fd2_compile_context *ctx,
		struct tgsi_src_register *reg, uint32_t val)
{
	unsigned neg, swiz, idx, i;
	/* actually maps 1:1 currently.. not sure if that is safe to rely on: */
	static const unsigned swiz2tgsi[] = {
			TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
	};

	for (i = 0; i < ctx->immediate_idx; i++) {
		swiz = i % 4;
		idx  = i / 4;

		if (ctx->so->immediates[idx].val[swiz] == val) {
			neg = 0;
			break;
		}

		if (ctx->so->immediates[idx].val[swiz] == -val) {
			neg = 1;
			break;
		}
	}

	if (i == ctx->immediate_idx) {
		/* need to generate a new immediate: */
		swiz = i % 4;
		idx  = i / 4;
		neg  = 0;
		ctx->so->immediates[idx].val[swiz] = val;
		ctx->so->num_immediates = idx + 1;
		ctx->immediate_idx++;
	}

	reg->File      = TGSI_FILE_IMMEDIATE;
	reg->Indirect  = 0;
	reg->Dimension = 0;
	reg->Index     = idx;
	reg->Absolute  = 0;
	reg->Negate    = neg;
	reg->SwizzleX  = swiz2tgsi[swiz];
	reg->SwizzleY  = swiz2tgsi[swiz];
	reg->SwizzleZ  = swiz2tgsi[swiz];
	reg->SwizzleW  = swiz2tgsi[swiz];
}

/* POW(a,b) = EXP2(b * LOG2(a)) */
static void
translate_pow(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register tmp_src;
	struct ir2_instruction *alu;

	get_internal_temp(ctx, &tmp_dst, &tmp_src);

	alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, LOG_CLAMP);
	add_regs_dummy_vector(alu);
	add_dst_reg(ctx, alu, &tmp_dst);
	add_src_reg(ctx, alu, &inst->Src[0].Register);

	alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
	add_dst_reg(ctx, alu, &tmp_dst);
	add_src_reg(ctx, alu, &tmp_src);
	add_src_reg(ctx, alu, &inst->Src[1].Register);

	/* NOTE: some of the instructions, like EXP_IEEE, seem hard-
	 * coded to take their input from the w component.
	 */
	switch(inst->Dst[0].Register.WriteMask) {
	case TGSI_WRITEMASK_X:
		tmp_src.SwizzleW = TGSI_SWIZZLE_X;
		break;
	case TGSI_WRITEMASK_Y:
		tmp_src.SwizzleW = TGSI_SWIZZLE_Y;
		break;
	case TGSI_WRITEMASK_Z:
		tmp_src.SwizzleW = TGSI_SWIZZLE_Z;
		break;
	case TGSI_WRITEMASK_W:
		tmp_src.SwizzleW = TGSI_SWIZZLE_W;
		break;
	default:
		DBG("invalid writemask!");
		assert(0);
		break;
	}

	alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, EXP_IEEE);
	add_regs_dummy_vector(alu);
	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &tmp_src);
	add_scalar_clamp(inst, alu);
}

static void
translate_tex(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst, unsigned opc)
{
	struct ir2_instruction *instr;
	struct ir2_register *reg;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register tmp_src;
	const struct tgsi_src_register *coord;
	bool using_temp = (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) ||
			inst->Instruction.Saturate;
	int idx;

	if (using_temp || (opc == TGSI_OPCODE_TXP))
		get_internal_temp(ctx, &tmp_dst, &tmp_src);

	if (opc == TGSI_OPCODE_TXP) {
		static const char *swiz[] = {
				[TGSI_SWIZZLE_X] = "xxxx",
				[TGSI_SWIZZLE_Y] = "yyyy",
				[TGSI_SWIZZLE_Z] = "zzzz",
				[TGSI_SWIZZLE_W] = "wwww",
		};

		/* TXP - Projective Texture Lookup:
		 *
		 *  coord.x = src0.x / src0.w
		 *  coord.y = src0.y / src0.w
		 *  coord.z = src0.z / src0.w
		 *  coord.w = src0.w
		 *  bias = 0.0
		 *
		 *  dst = texture_sample(unit, coord, bias)
		 */
		instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, RECIP_IEEE);

		/* MAXv: */
		add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "___w";
		add_src_reg(ctx, instr, &inst->Src[0].Register);
		add_src_reg(ctx, instr, &inst->Src[0].Register);

		/* RECIP_IEEE: */
		add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "x___";
		add_src_reg(ctx, instr, &inst->Src[0].Register)->swizzle =
				swiz[inst->Src[0].Register.SwizzleW];

		instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
		add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "xyz_";
		add_src_reg(ctx, instr, &tmp_src)->swizzle = "xxxx";
		add_src_reg(ctx, instr, &inst->Src[0].Register);

		coord = &tmp_src;
	} else {
		coord = &inst->Src[0].Register;
	}

	instr = ir2_instr_create(next_exec_cf(ctx), IR2_FETCH);
	instr->fetch.opc = TEX_FETCH;
	instr->fetch.is_cube = (inst->Texture.Texture == TGSI_TEXTURE_3D);
	assert(inst->Texture.NumOffsets <= 1); // TODO what to do in other cases?

	/* save off the tex fetch to be patched later with correct const_idx: */
	idx = ctx->so->num_tfetch_instrs++;
	ctx->so->tfetch_instrs[idx].samp_id = inst->Src[1].Register.Index;
	ctx->so->tfetch_instrs[idx].instr = instr;

	add_dst_reg(ctx, instr, using_temp ? &tmp_dst : &inst->Dst[0].Register);
	reg = add_src_reg(ctx, instr, coord);

	/* blob compiler always sets 3rd component to same as 1st for 2d: */
	if (inst->Texture.Texture == TGSI_TEXTURE_2D)
		reg->swizzle[2] = reg->swizzle[0];

	/* dst register needs to be marked for sync: */
	ctx->need_sync |= 1 << instr->regs[0]->num;

	/* TODO we need some way to know if the tex fetch needs to sync on alu pipe.. */
	instr->sync = true;

	if (using_temp) {
		/* texture fetch can't write directly to an export, so if tgsi
		 * is telling us the dst register is in the output file, we
		 * load the texture into a temp and then use an ALU
		 * instruction to move it to the output:
		 */
		instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, ~0);

		add_dst_reg(ctx, instr, &inst->Dst[0].Register);
		add_src_reg(ctx, instr, &tmp_src);
		add_src_reg(ctx, instr, &tmp_src);
		add_vector_clamp(inst, instr);
	}
}

/* SGE(a,b) = GTE((b - a), 1.0, 0.0) */
/* SLT(a,b) = GTE((b - a), 0.0, 1.0) */
static void
translate_sge_slt(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst, unsigned opc)
{
	struct ir2_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register tmp_src;
	struct tgsi_src_register tmp_const;
	float c0, c1;

	switch (opc) {
	default:
		assert(0);
	case TGSI_OPCODE_SGE:
		c0 = 1.0;
		c1 = 0.0;
		break;
	case TGSI_OPCODE_SLT:
		c0 = 0.0;
		c1 = 1.0;
		break;
	}

	get_internal_temp(ctx, &tmp_dst, &tmp_src);

	instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
	add_src_reg(ctx, instr, &inst->Src[1].Register);

	instr = ir2_instr_create_alu(next_exec_cf(ctx), CNDGTEv, ~0);
	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
	/* maybe should re-arrange the syntax some day, but what the
	 * assembler/disassembler and ir.c expect is:
	 *   MULADDv Rdst = Rsrc2 + Rsrc0 * Rsrc1
	 */
	get_immediate(ctx, &tmp_const, fui(c0));
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &tmp_src);
	get_immediate(ctx, &tmp_const, fui(c1));
	add_src_reg(ctx, instr, &tmp_const);
}

/* LRP(a,b,c) = (a * b) + ((1 - a) * c) */
static void
translate_lrp(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		unsigned opc)
{
	struct ir2_instruction *instr;
	struct tgsi_dst_register tmp_dst1, tmp_dst2;
	struct tgsi_src_register tmp_src1, tmp_src2;
	struct tgsi_src_register tmp_const;

	get_internal_temp(ctx, &tmp_dst1, &tmp_src1);
	get_internal_temp(ctx, &tmp_dst2, &tmp_src2);

	get_immediate(ctx, &tmp_const, fui(1.0));

	/* tmp1 = (a * b) */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst1);
	add_src_reg(ctx, instr, &inst->Src[0].Register);
	add_src_reg(ctx, instr, &inst->Src[1].Register);

	/* tmp2 = (1 - a) */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst2);
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;

	/* tmp2 = tmp2 * c */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst2);
	add_src_reg(ctx, instr, &tmp_src2);
	add_src_reg(ctx, instr, &inst->Src[2].Register);

	/* dst = tmp1 + tmp2 */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
	add_src_reg(ctx, instr, &tmp_src1);
	add_src_reg(ctx, instr, &tmp_src2);
}

static void
translate_trig(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		unsigned opc)
{
	struct ir2_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register tmp_src;
	struct tgsi_src_register tmp_const;
	instr_scalar_opc_t op;

	switch (opc) {
	default:
		assert(0);
	case TGSI_OPCODE_SIN:
		op = SIN;
		break;
	case TGSI_OPCODE_COS:
		op = COS;
		break;
	}

	get_internal_temp(ctx, &tmp_dst, &tmp_src);

	tmp_dst.WriteMask = TGSI_WRITEMASK_X;
	tmp_src.SwizzleX = tmp_src.SwizzleY =
			tmp_src.SwizzleZ = tmp_src.SwizzleW = TGSI_SWIZZLE_X;

	/* maybe should re-arrange the syntax some day, but what the
	 * assembler/disassembler and ir.c expect is:
	 *   MULADDv Rdst = Rsrc2 + Rsrc0 * Rsrc1
	 */
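	/* The three vector instrs below range-reduce the angle before the
	 * scalar SIN/COS.  The constants are 1/(2*pi), 2*pi and pi, so:
	 *   tmp = frac(x * 1/(2*pi) + 0.5)
	 *   tmp = tmp * 2*pi - pi
	 * which maps x into [-pi, pi):
	 */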
	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	get_immediate(ctx, &tmp_const, fui(0.5));
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &inst->Src[0].Register);
	get_immediate(ctx, &tmp_const, fui(0.159155));
	add_src_reg(ctx, instr, &tmp_const);

	instr = ir2_instr_create_alu(next_exec_cf(ctx), FRACv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	add_src_reg(ctx, instr, &tmp_src);
	add_src_reg(ctx, instr, &tmp_src);

	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	get_immediate(ctx, &tmp_const, fui(-3.141593));
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &tmp_src);
	get_immediate(ctx, &tmp_const, fui(6.283185));
	add_src_reg(ctx, instr, &tmp_const);

	instr = ir2_instr_create_alu(next_exec_cf(ctx), ~0, op);
	add_regs_dummy_vector(instr);
	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
	add_src_reg(ctx, instr, &tmp_src);
}

/*
 * Main part of compiler/translator:
 */

static void
translate_instruction(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	unsigned opc = inst->Instruction.Opcode;
	struct ir2_instruction *instr;
	struct ir2_cf *cf;

	if (opc == TGSI_OPCODE_END)
		return;

	if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
		unsigned num = inst->Dst[0].Register.Index;
		/* seems like we need to ensure that position vs param/pixel
		 * exports don't end up in the same EXEC clause..  easy way
		 * to do this is force a new EXEC clause on first appearance
		 * of a position or param/pixel export.
		 */
		if ((num == ctx->position) || (num == ctx->psize)) {
			if (ctx->num_position > 0) {
				ctx->cf = NULL;
				ir2_cf_create_alloc(ctx->so->ir, SQ_POSITION,
						ctx->num_position - 1);
				ctx->num_position = 0;
			}
		} else {
			if (ctx->num_param > 0) {
				ctx->cf = NULL;
				ir2_cf_create_alloc(ctx->so->ir, SQ_PARAMETER_PIXEL,
						ctx->num_param - 1);
				ctx->num_param = 0;
			}
		}
	}

	cf = next_exec_cf(ctx);

	/* TODO turn this into a table: */
	switch (opc) {
	case TGSI_OPCODE_MOV:
		instr = ir2_instr_create_alu(cf, MAXv, ~0);
		add_regs_vector_1(ctx, inst, instr);
		break;
	case TGSI_OPCODE_RCP:
		instr = ir2_instr_create_alu(cf, ~0, RECIP_IEEE);
		add_regs_scalar_1(ctx, inst, instr);
		break;
	case TGSI_OPCODE_RSQ:
		instr = ir2_instr_create_alu(cf, ~0, RECIPSQ_IEEE);
		add_regs_scalar_1(ctx, inst, instr);
		break;
	case TGSI_OPCODE_SQRT:
		instr = ir2_instr_create_alu(cf, ~0, SQRT_IEEE);
		add_regs_scalar_1(ctx, inst, instr);
		break;
	case TGSI_OPCODE_MUL:
		instr = ir2_instr_create_alu(cf, MULv, ~0);
		add_regs_vector_2(ctx, inst, instr);
		break;
	case TGSI_OPCODE_ADD:
		instr = ir2_instr_create_alu(cf, ADDv, ~0);
		add_regs_vector_2(ctx, inst, instr);
		break;
	case TGSI_OPCODE_DP3:
		instr = ir2_instr_create_alu(cf, DOT3v, ~0);
		add_regs_vector_2(ctx, inst, instr);
		break;
	case TGSI_OPCODE_DP4:
		instr = ir2_instr_create_alu(cf, DOT4v, ~0);
		add_regs_vector_2(ctx, inst, instr);
		break;
	case TGSI_OPCODE_MIN:
		instr = ir2_instr_create_alu(cf, MINv, ~0);
		add_regs_vector_2(ctx, inst, instr);
		break;
	case TGSI_OPCODE_MAX:
		instr = ir2_instr_create_alu(cf, MAXv, ~0);
		add_regs_vector_2(ctx, inst, instr);
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_SGE:
		translate_sge_slt(ctx, inst, opc);
		break;
	case TGSI_OPCODE_MAD:
		instr = ir2_instr_create_alu(cf, MULADDv, ~0);
		add_regs_vector_3(ctx, inst, instr);
		break;
	case TGSI_OPCODE_LRP:
		translate_lrp(ctx, inst, opc);
		break;
	case TGSI_OPCODE_FRC:
		instr = ir2_instr_create_alu(cf, FRACv, ~0);
		add_regs_vector_1(ctx, inst, instr);
		break;
	case TGSI_OPCODE_FLR:
		instr = ir2_instr_create_alu(cf, FLOORv, ~0);
		add_regs_vector_1(ctx, inst, instr);
		break;
	case TGSI_OPCODE_EX2:
		instr = ir2_instr_create_alu(cf, ~0, EXP_IEEE);
		add_regs_scalar_1(ctx, inst, instr);
		break;
	case TGSI_OPCODE_POW:
		translate_pow(ctx, inst);
		break;
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_SIN:
		translate_trig(ctx, inst, opc);
		break;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXP:
		translate_tex(ctx, inst, opc);
		break;
	case TGSI_OPCODE_CMP:
		instr = ir2_instr_create_alu(cf, CNDGTEv, ~0);
		add_regs_vector_3(ctx, inst, instr);
		// TODO this should be src0 if regs were in sane order..
		instr->regs[2]->flags ^= IR2_REG_NEGATE; /* src1 */
		break;
	case TGSI_OPCODE_IF:
		push_predicate(ctx, &inst->Src[0].Register);
		ctx->so->ir->pred = IR2_PRED_EQ;
		break;
	case TGSI_OPCODE_ELSE:
		ctx->so->ir->pred = IR2_PRED_NE;
		/* not sure if this is required in all cases, but blob compiler
		 * won't combine EQ and NE in same CF:
		 */
		ctx->cf = NULL;
		break;
	case TGSI_OPCODE_ENDIF:
		pop_predicate(ctx);
		break;
	case TGSI_OPCODE_F2I:
		instr = ir2_instr_create_alu(cf, TRUNCv, ~0);
		add_regs_vector_1(ctx, inst, instr);
		break;
	default:
		DBG("unknown TGSI opc: %s", tgsi_get_opcode_name(opc));
		tgsi_dump(ctx->so->tokens, 0);
		assert(0);
		break;
	}

	/* internal temporaries are only valid for the duration of a single
	 * TGSI instruction:
	 */
	ctx->num_internal_temps = 0;
}

static void
compile_instructions(struct fd2_compile_context *ctx)
{
	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
		tgsi_parse_token(&ctx->parser);

		switch (ctx->parser.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			translate_instruction(ctx,
					&ctx->parser.FullToken.FullInstruction);
			break;
		default:
			break;
		}
	}

	ctx->cf->cf_type = EXEC_END;
}

int
fd2_compile_shader(struct fd_program_stateobj *prog,
		struct fd2_shader_stateobj *so)
{
	struct fd2_compile_context ctx;

	ir2_shader_destroy(so->ir);
	so->ir = ir2_shader_create();
	so->num_vfetch_instrs = so->num_tfetch_instrs = so->num_immediates = 0;

	if (compile_init(&ctx, prog, so) != TGSI_PARSE_OK)
		return -1;

	if (ctx.type == PIPE_SHADER_VERTEX) {
		compile_vtx_fetch(&ctx);
	} else if (ctx.type == PIPE_SHADER_FRAGMENT) {
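		/* the FS assigns export slots (prog->export_linkage[]) on
		 * first use as its inputs are seen; the VS consults the same
		 * table when emitting its exports (see export_linkage()):
		 */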
		prog->num_exports = 0;
		memset(prog->export_linkage, 0xff,
				sizeof(prog->export_linkage));
	}

	compile_instructions(&ctx);

	compile_free(&ctx);

	return 0;
}