ir3_compiler.c revision 547182977f5d893334cb630b974136c05a9461ab
1/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
3/*
4 * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 *
25 * Authors:
26 *    Rob Clark <robclark@freedesktop.org>
27 */
28
29#include <stdarg.h>
30
31#include "pipe/p_state.h"
32#include "util/u_string.h"
33#include "util/u_memory.h"
34#include "util/u_inlines.h"
35#include "tgsi/tgsi_lowering.h"
36#include "tgsi/tgsi_parse.h"
37#include "tgsi/tgsi_ureg.h"
38#include "tgsi/tgsi_info.h"
39#include "tgsi/tgsi_strings.h"
40#include "tgsi/tgsi_dump.h"
41#include "tgsi/tgsi_scan.h"
42
43#include "freedreno_util.h"
44
45#include "ir3_compiler.h"
46#include "ir3_shader.h"
47
48#include "instr-a3xx.h"
49#include "ir3.h"
50
struct ir3_compile_context {
	/* TGSI token stream being compiled (possibly a lowered copy): */
	const struct tgsi_token *tokens;
	/* true iff 'tokens' was allocated by tgsi_transform_lowering()
	 * and must be free()'d in compile_free():
	 */
	bool free_tokens;
	struct ir3 *ir;
	/* the shader variant being compiled: */
	struct ir3_shader_variant *so;

	/* innermost (current) block, and the instruction most recently
	 * created via instr_create()/instr_clone():
	 */
	struct ir3_block *block;
	struct ir3_instruction *current_instr;

	/* we need to defer updates to block->outputs[] until the end
	 * of an instruction (so we don't see new value until *after*
	 * the src registers are processed)
	 */
	struct {
		struct ir3_instruction *instr, **instrp;
	} output_updates[16];
	unsigned num_output_updates;

	/* are we in a sequence of "atomic" instructions?
	 */
	bool atomic;

	/* For fragment shaders, from the hw perspective the only
	 * actual input is r0.xy position register passed to bary.f.
	 * But TGSI doesn't know that, it still declares things as
	 * IN[] registers.  So we do all the input tracking normally
	 * and fix things up after compile_instructions()
	 *
	 * NOTE that frag_pos is the hardware position (possibly it
	 * is actually an index or tag or some such.. it is *not*
	 * values that can be directly used for gl_FragCoord..)
	 */
	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];

	struct tgsi_parse_context parser;
	/* TGSI_PROCESSOR_* value from the parsed shader header: */
	unsigned type;

	struct tgsi_shader_info info;

	/* for calculating input/output positions/linkages: */
	unsigned next_inloc;

	/* internal temporaries handed out by get_internal_temp(): */
	unsigned num_internal_temps;
	struct tgsi_src_register internal_temps[8];

	/* idx/slot for last compiler generated immediate */
	unsigned immediate_idx;

	/* stack of branch instructions that mark (potentially nested)
	 * branch if/else/loop/etc
	 */
	struct {
		struct ir3_instruction *instr, *cond;
		bool inv;   /* true iff in else leg of branch */
	} branch[16];
	unsigned int branch_count;

	/* list of kill instructions: */
	struct ir3_instruction *kill[16];
	unsigned int kill_count;

	/* used when dst is same as one of the src, to avoid overwriting a
	 * src element before the remaining scalar instructions that make
	 * up the vector operation
	 */
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	/* just for catching incorrect use of get_dst()/put_dst():
	 */
	bool using_tmp_dst;
};
123
124
/* forward declarations (definitions below): */
static void vectorize(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, struct tgsi_dst_register *dst,
		int nsrcs, ...);
static void create_mov(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
static type_t get_ftype(struct ir3_compile_context *ctx);
131
/* Set up the compile context for one shader variant: run the TGSI
 * lowering pass, reset all per-compile state, and parse the shader
 * header.  Returns TGSI_PARSE_OK on success, or a TGSI_PARSE_* error
 * code (unsupported relative addressing, parse failure).
 */
static unsigned
compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
		const struct tgsi_token *tokens)
{
	unsigned ret;
	struct tgsi_shader_info *info = &ctx->info;
	/* lower TGSI constructs this backend does not handle natively: */
	struct tgsi_lowering_config lconfig = {
			.color_two_side = so->key.color_two_side,
			.lower_DST  = true,
			.lower_XPD  = true,
			.lower_SCS  = true,
			.lower_LRP  = true,
			.lower_FRC  = true,
			.lower_POW  = true,
			.lower_LIT  = true,
			.lower_EXP  = true,
			.lower_LOG  = true,
			.lower_DP4  = true,
			.lower_DP3  = true,
			.lower_DPH  = true,
			.lower_DP2  = true,
			.lower_DP2A = true,
	};

	/* texture-coordinate saturate lowering is keyed per stage: */
	switch (so->type) {
	case SHADER_FRAGMENT:
	case SHADER_COMPUTE:
		lconfig.saturate_s = so->key.fsaturate_s;
		lconfig.saturate_t = so->key.fsaturate_t;
		lconfig.saturate_r = so->key.fsaturate_r;
		break;
	case SHADER_VERTEX:
		lconfig.saturate_s = so->key.vsaturate_s;
		lconfig.saturate_t = so->key.vsaturate_t;
		lconfig.saturate_r = so->key.vsaturate_r;
		break;
	}

	/* lowering returns a newly allocated token stream (which we then
	 * own), or NULL if nothing needed lowering:
	 */
	ctx->tokens = tgsi_transform_lowering(&lconfig, tokens, &ctx->info);
	ctx->free_tokens = !!ctx->tokens;
	if (!ctx->tokens) {
		/* no lowering */
		ctx->tokens = tokens;
	}
	ctx->ir = so->ir;
	ctx->so = so;
	ctx->next_inloc = 8;
	ctx->num_internal_temps = 0;
	ctx->branch_count = 0;
	ctx->kill_count = 0;
	ctx->block = NULL;
	ctx->current_instr = NULL;
	ctx->num_output_updates = 0;
	ctx->atomic = false;
	ctx->frag_pos = NULL;
	ctx->frag_face = NULL;
	ctx->tmp_src = NULL;
	ctx->using_tmp_dst = false;

	memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));

#define FM(x) (1 << TGSI_FILE_##x)
	/* optimize can't deal with relative addressing: */
	/* NOTE(review): on this early-error path the lowered tokens are
	 * not freed here; presumably the caller still invokes
	 * compile_free() -- confirm against caller.
	 */
	if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
		return TGSI_PARSE_ERROR;

	/* NOTE: if relative addressing is used, we set constlen in
	 * the compiler (to worst-case value) since we don't know in
	 * the assembler what the max addr reg value can be:
	 */
	if (info->indirect_files & FM(CONSTANT))
		so->constlen = 4 * (ctx->info.file_max[TGSI_FILE_CONSTANT] + 1);

	/* Immediates go after constants: */
	so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);

	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
	if (ret != TGSI_PARSE_OK)
		return ret;

	ctx->type = ctx->parser.FullHeader.Processor.Processor;

	return ret;
}
217
/* Report a compile error: print the formatted message, dump the TGSI
 * being compiled, then debug_assert(0).  NOTE(review): debug_assert()
 * is presumably compiled out in release builds, in which case
 * compilation carries on after the error -- confirm.
 */
static void
compile_error(struct ir3_compile_context *ctx, const char *format, ...)
{
	va_list ap;
	va_start(ap, format);
	_debug_vprintf(format, ap);
	va_end(ap);
	tgsi_dump(ctx->tokens, 0);
	debug_assert(0);
}
228
/* assert-helper which also dumps the TGSI on failure (via compile_error): */
#define compile_assert(ctx, cond) do { \
		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
	} while (0)
232
/* Tear down state set up by compile_init(), freeing the lowered token
 * stream if we own it (see ctx->free_tokens).
 */
static void
compile_free(struct ir3_compile_context *ctx)
{
	if (ctx->free_tokens)
		free((void *)ctx->tokens);
	tgsi_parse_free(&ctx->parser);
}
240
/* table entry mapping one TGSI opcode to its translation handler: */
struct instr_translater {
	/* handler that emits ir3 instruction(s) for the TGSI instruction: */
	void (*fxn)(const struct instr_translater *t,
			struct ir3_compile_context *ctx,
			struct tgsi_full_instruction *inst);
	unsigned tgsi_opc;   /* the TGSI_OPCODE_* this entry handles */
	opc_t opc;           /* native opcode to emit */
	opc_t hopc;    /* opc to use for half_precision mode, if different */
	unsigned arg;        /* extra handler-specific argument */
};
250
251static void
252instr_finish(struct ir3_compile_context *ctx)
253{
254	unsigned i;
255
256	if (ctx->atomic)
257		return;
258
259	for (i = 0; i < ctx->num_output_updates; i++)
260		*(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;
261
262	ctx->num_output_updates = 0;
263}
264
265/* For "atomic" groups of instructions, for example the four scalar
266 * instructions to perform a vec4 operation.  Basically this just
267 * blocks out handling of output_updates so the next scalar instruction
268 * still sees the result from before the start of the atomic group.
269 *
270 * NOTE: when used properly, this could probably replace get/put_dst()
271 * stuff.
272 */
/* Begin an atomic group: instr_finish() becomes a no-op until
 * instr_atomic_end(), so instructions inside the group keep reading
 * the values from before the group started.
 */
static void
instr_atomic_start(struct ir3_compile_context *ctx)
{
	ctx->atomic = true;
}
278
/* End an atomic group and flush the deferred output updates that
 * accumulated while it was open.
 */
static void
instr_atomic_end(struct ir3_compile_context *ctx)
{
	ctx->atomic = false;
	instr_finish(ctx);
}
285
/* Create a new instruction in the current block, first flushing any
 * deferred output updates from the previous instruction.
 */
static struct ir3_instruction *
instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
{
	instr_finish(ctx);
	return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc));
}
292
/* Clone an existing instruction (used by vectorize() to replicate the
 * scalar op per component), flushing deferred updates first.
 */
static struct ir3_instruction *
instr_clone(struct ir3_compile_context *ctx, struct ir3_instruction *instr)
{
	instr_finish(ctx);
	return (ctx->current_instr = ir3_instr_clone(instr));
}
299
/* Push a new block (child of the current one), sizing its register
 * tracking arrays from the TGSI declarations.  The new block becomes
 * ctx->block.
 */
static struct ir3_block *
push_block(struct ir3_compile_context *ctx)
{
	struct ir3_block *block;
	unsigned ntmp, nin, nout;

/* scalar register count for a vec4 TGSI file: */
#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))

	/* hmm, give ourselves room to create 8 extra temporaries (vec4):
	 */
	ntmp = SCALAR_REGS(TEMPORARY);
	ntmp += 8 * 4;

	nout = SCALAR_REGS(OUTPUT);
	nin  = SCALAR_REGS(INPUT);

	/* for outermost block, 'inputs' are the actual shader INPUT
	 * register file.  Reads from INPUT registers always go back to
	 * top block.  For nested blocks, 'inputs' is used to track any
	 * TEMPORARY file register from one of the enclosing blocks that
	 * is ready in this block.
	 */
	if (!ctx->block) {
		/* NOTE: fragment shaders actually have two inputs (r0.xy, the
		 * position)
		 */
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			int n = 2;
			if (ctx->info.reads_position)
				n += 4;
			if (ctx->info.uses_frontface)
				n += 4;
			nin = MAX2(n, nin);
			/* reserve extra output slots for kill instructions: */
			nout += ARRAY_SIZE(ctx->kill);
		}
	} else {
		nin = ntmp;
	}

	block = ir3_block_create(ctx->ir, ntmp, nin, nout);

	/* the kill slots exist in the outputs[] array but are not counted
	 * in noutputs (allocated above, subtracted back out here):
	 */
	if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block)
		block->noutputs -= ARRAY_SIZE(ctx->kill);

	block->parent = ctx->block;
	ctx->block = block;

	return block;
}
349
/* Return to the parent block; popping the outermost block is a
 * compile error.
 */
static void
pop_block(struct ir3_compile_context *ctx)
{
	ctx->block = ctx->block->parent;
	compile_assert(ctx, ctx->block);
}
356
/* Create a meta-OUTPUT instruction in 'block' for output slot 'n',
 * optionally taking 'instr' as its SSA source.
 */
static struct ir3_instruction *
create_output(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned n)
{
	struct ir3_instruction *out;

	out = ir3_instr_create(block, -1, OPC_META_OUTPUT);
	out->inout.block = block;
	ir3_reg_create(out, n, 0);
	if (instr)
		ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr;

	return out;
}
371
/* Create a meta-INPUT instruction in 'block' for input slot 'n',
 * optionally taking 'instr' (the value from an enclosing block) as
 * its SSA source.
 */
static struct ir3_instruction *
create_input(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned n)
{
	struct ir3_instruction *in;

	in = ir3_instr_create(block, -1, OPC_META_INPUT);
	in->inout.block = block;
	ir3_reg_create(in, n, 0);
	if (instr)
		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;

	return in;
}
386
387static struct ir3_instruction *
388block_input(struct ir3_block *block, unsigned n)
389{
390	/* references to INPUT register file always go back up to
391	 * top level:
392	 */
393	if (block->parent)
394		return block_input(block->parent, n);
395	return block->inputs[n];
396}
397
/* return temporary in scope, creating if needed meta-input node
 * to track block inputs
 */
static struct ir3_instruction *
block_temporary(struct ir3_block *block, unsigned n)
{
	/* references to TEMPORARY register file, find the nearest
	 * enclosing block which has already assigned this temporary,
	 * creating meta-input instructions along the way to keep
	 * track of block inputs
	 */
	if (block->parent && !block->temporaries[n]) {
		/* if already have input for this block, reuse: */
		if (!block->inputs[n])
			block->inputs[n] = block_temporary(block->parent, n);

		/* and create new input to return: */
		return create_input(block, block->inputs[n], n);
	}
	/* may be NULL if the temporary has never been written; ssa_src()
	 * substitutes an immediate 0.0 in that case:
	 */
	return block->temporaries[n];
}
419
/* Create a cat1 (mov) instruction loading the float immediate 'val'.
 */
static struct ir3_instruction *
create_immed(struct ir3_compile_context *ctx, float val)
{
	/* NOTE: *don't* use instr_create() here!
	 * (instr_create() calls instr_finish(), which would flush the
	 * deferred output updates in the middle of an instruction)
	 */
	struct ir3_instruction *instr;
	instr = ir3_instr_create(ctx->block, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = get_ftype(ctx);
	ir3_reg_create(instr, 0, 0);
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val;
	return instr;
}
433
/* Record (deferred) that 'instr' produces the value for dst/chan.
 * The update is queued and applied at instr_finish(), so later srcs
 * of the same TGSI instruction still read the previous value.
 * Register files other than OUTPUT/TEMPORARY/ADDRESS are silently
 * ignored here.
 */
static void
ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_dst_register *dst, unsigned chan)
{
	unsigned n = regid(dst->Index, chan);
	unsigned idx = ctx->num_output_updates;

	compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));

	/* NOTE: defer update of temporaries[idx] or output[idx]
	 * until instr_finish(), so that if the current instruction
	 * reads the same TEMP/OUT[] it gets the old value:
	 *
	 * bleh.. this might be a bit easier to just figure out
	 * in instr_finish().  But at that point we've already
	 * lost information about OUTPUT vs TEMPORARY register
	 * file..
	 */

	switch (dst->File) {
	case TGSI_FILE_OUTPUT:
		compile_assert(ctx, n < ctx->block->noutputs);
		ctx->output_updates[idx].instrp = &ctx->block->outputs[n];
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	case TGSI_FILE_TEMPORARY:
		compile_assert(ctx, n < ctx->block->ntemporaries);
		ctx->output_updates[idx].instrp = &ctx->block->temporaries[n];
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	case TGSI_FILE_ADDRESS:
		/* only a single address register (a0.x): */
		compile_assert(ctx, n < 1);
		ctx->output_updates[idx].instrp = &ctx->block->address;
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	}
}
474
/* Hook up the SSA source for 'reg': resolve an INPUT/OUTPUT/TEMPORARY
 * read of src/chan to the instruction that last produced that value.
 * Reads of never-written components are replaced with an immediate
 * 0.0.  Other register files (CONSTANT/IMMEDIATE) are left as plain
 * register references.
 */
static void
ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
		const struct tgsi_src_register *src, unsigned chan)
{
	struct ir3_block *block = ctx->block;
	unsigned n = regid(src->Index, chan);

	switch (src->File) {
	case TGSI_FILE_INPUT:
		reg->flags |= IR3_REG_SSA;
		reg->instr = block_input(ctx->block, n);
		break;
	case TGSI_FILE_OUTPUT:
		/* really this should just happen in case of 'MOV_SAT OUT[n], ..',
		 * for the following clamp instructions:
		 */
		reg->flags |= IR3_REG_SSA;
		reg->instr = block->outputs[n];
		/* we don't have to worry about read from an OUTPUT that was
		 * assigned outside of the current block, because the _SAT
		 * clamp instructions will always be in the same block as
		 * the original instruction which wrote the OUTPUT
		 */
		compile_assert(ctx, reg->instr);
		break;
	case TGSI_FILE_TEMPORARY:
		reg->flags |= IR3_REG_SSA;
		reg->instr = block_temporary(ctx->block, n);
		break;
	}

	if ((reg->flags & IR3_REG_SSA) && !reg->instr) {
		/* this can happen when registers (or components of a TGSI
		 * register) are used as src before they have been assigned
		 * (undefined contents).  To avoid confusing the rest of the
		 * compiler, and to generally keep things peachy, substitute
		 * an instruction that sets the src to 0.0.  Or to keep
		 * things undefined, I could plug in a random number? :-P
		 *
		 * NOTE: *don't* use instr_create() here!
		 */
		reg->instr = create_immed(ctx, 0.0);
	}
}
519
/* Add a dst register to 'instr' for the given TGSI dst, channel and
 * write-mask.  For a multi-component write-mask, per-component
 * meta-FO ("fan-out") instructions are created so that each written
 * component has its own SSA producer.  Returns the register created
 * on 'instr'.
 */
static struct ir3_register *
add_dst_reg_wrmask(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
		unsigned chan, unsigned wrmask)
{
	unsigned flags = 0, num = 0;
	struct ir3_register *reg;

	switch (dst->File) {
	case TGSI_FILE_OUTPUT:
	case TGSI_FILE_TEMPORARY:
		/* uses SSA */
		break;
	case TGSI_FILE_ADDRESS:
		flags |= IR3_REG_ADDR;
		/* uses SSA */
		break;
	default:
		compile_error(ctx, "unsupported dst register file: %s\n",
			tgsi_file_name(dst->File));
		break;
	}

	if (dst->Indirect)
		flags |= IR3_REG_RELATIV;

	reg = ir3_reg_create(instr, regid(num, chan), flags);

	/* NOTE: do not call ssa_dst() if atomic.. vectorize()
	 * itself will call ssa_dst().  This is to filter out
	 * the (initially bogus) .x component dst which is
	 * created (but not necessarily used, ie. if the net
	 * result of the vector operation does not write to
	 * the .x component)
	 */

	reg->wrmask = wrmask;
	if (wrmask == 0x1) {
		/* normal case */
		if (!ctx->atomic)
			ssa_dst(ctx, instr, dst, chan);
	} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
			(dst->File == TGSI_FILE_OUTPUT) ||
			(dst->File == TGSI_FILE_ADDRESS)) {
		unsigned i;

		/* if instruction writes multiple, we need to create
		 * some place-holder collect the registers:
		 */
		for (i = 0; i < 4; i++) {
			if (wrmask & (1 << i)) {
				struct ir3_instruction *collect =
						ir3_instr_create(ctx->block, -1, OPC_META_FO);
				/* which component of the multi-reg value this fans out: */
				collect->fo.off = i;
				/* unused dst reg: */
				ir3_reg_create(collect, 0, 0);
				/* and src reg used to hold original instr */
				ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
				if (!ctx->atomic)
					ssa_dst(ctx, collect, dst, chan+i);
			}
		}
	}

	return reg;
}
586
/* add a single-component (wrmask == .x) dst register: */
static struct ir3_register *
add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_dst_register *dst, unsigned chan)
{
	return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
}
593
594static struct ir3_register *
595add_src_reg_wrmask(struct ir3_compile_context *ctx,
596		struct ir3_instruction *instr, const struct tgsi_src_register *src,
597		unsigned chan, unsigned wrmask)
598{
599	unsigned flags = 0, num = 0;
600	struct ir3_register *reg;
601	struct ir3_instruction *orig = NULL;
602
603	switch (src->File) {
604	case TGSI_FILE_IMMEDIATE:
605		/* TODO if possible, use actual immediate instead of const.. but
606		 * TGSI has vec4 immediates, we can only embed scalar (of limited
607		 * size, depending on instruction..)
608		 */
609		flags |= IR3_REG_CONST;
610		num = src->Index + ctx->so->first_immediate;
611		break;
612	case TGSI_FILE_CONSTANT:
613		flags |= IR3_REG_CONST;
614		num = src->Index;
615		break;
616	case TGSI_FILE_OUTPUT:
617		/* NOTE: we should only end up w/ OUTPUT file for things like
618		 * clamp()'ing saturated dst instructions
619		 */
620	case TGSI_FILE_INPUT:
621	case TGSI_FILE_TEMPORARY:
622		/* uses SSA */
623		break;
624	default:
625		compile_error(ctx, "unsupported src register file: %s\n",
626			tgsi_file_name(src->File));
627		break;
628	}
629
630	/* We seem to have 8 bits (6.2) for dst register always, so I think
631	 * it is safe to assume GPR cannot be >=64
632	 *
633	 * cat3 instructions only have 8 bits for src2, but cannot take a
634	 * const for src2
635	 *
636	 * cat5 and cat6 in some cases only has 8 bits, but cannot take a
637	 * const for any src.
638	 *
639	 * Other than that we seem to have 12 bits to encode const src,
640	 * except for cat1 which may only have 11 bits (but that seems like
641	 * a bug)
642	 */
643	if (flags & IR3_REG_CONST)
644		compile_assert(ctx, src->Index < (1 << 9));
645	else
646		compile_assert(ctx, src->Index < (1 << 6));
647
648	if (src->Absolute)
649		flags |= IR3_REG_ABS;
650	if (src->Negate)
651		flags |= IR3_REG_NEGATE;
652
653	if (src->Indirect) {
654		flags |= IR3_REG_RELATIV;
655
656		/* shouldn't happen, and we can't cope with it below: */
657		compile_assert(ctx, wrmask == 0x1);
658
659		/* wrap in a meta-deref to track both the src and address: */
660		orig = instr;
661
662		instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF);
663		ir3_reg_create(instr, 0, 0);
664		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address;
665	}
666
667	reg = ir3_reg_create(instr, regid(num, chan), flags);
668
669	reg->wrmask = wrmask;
670	if (wrmask == 0x1) {
671		/* normal case */
672		ssa_src(ctx, reg, src, chan);
673	} else if ((src->File == TGSI_FILE_TEMPORARY) ||
674			(src->File == TGSI_FILE_OUTPUT) ||
675			(src->File == TGSI_FILE_INPUT)) {
676		struct ir3_instruction *collect;
677		unsigned i;
678
679		compile_assert(ctx, !src->Indirect);
680
681		/* if instruction reads multiple, we need to create
682		 * some place-holder collect the registers:
683		 */
684		collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
685		ir3_reg_create(collect, 0, 0);   /* unused dst reg */
686
687		for (i = 0; i < 4; i++) {
688			if (wrmask & (1 << i)) {
689				/* and src reg used point to the original instr */
690				ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
691						src, chan + i);
692			} else if (wrmask & ~((i << i) - 1)) {
693				/* if any remaining components, then dummy
694				 * placeholder src reg to fill in the blanks:
695				 */
696				ir3_reg_create(collect, 0, 0);
697			}
698		}
699
700		reg->flags |= IR3_REG_SSA;
701		reg->instr = collect;
702	}
703
704	if (src->Indirect) {
705		reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA);
706		reg->instr = instr;
707	}
708	return reg;
709}
710
/* add a single-component (wrmask == .x) src register: */
static struct ir3_register *
add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_src_register *src, unsigned chan)
{
	return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
}
717
718static void
719src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
720{
721	src->File      = dst->File;
722	src->Indirect  = dst->Indirect;
723	src->Dimension = dst->Dimension;
724	src->Index     = dst->Index;
725	src->Absolute  = 0;
726	src->Negate    = 0;
727	src->SwizzleX  = TGSI_SWIZZLE_X;
728	src->SwizzleY  = TGSI_SWIZZLE_Y;
729	src->SwizzleZ  = TGSI_SWIZZLE_Z;
730	src->SwizzleW  = TGSI_SWIZZLE_W;
731}
732
733/* Get internal-temp src/dst to use for a sequence of instructions
734 * generated by a single TGSI op.
735 */
static struct tgsi_src_register *
get_internal_temp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *tmp_dst)
{
	struct tgsi_src_register *tmp_src;
	int n;

	/* fill in the dst side (full xyzw write, direct addressing): */
	tmp_dst->File      = TGSI_FILE_TEMPORARY;
	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
	tmp_dst->Indirect  = 0;
	tmp_dst->Dimension = 0;

	/* assign next temporary: */
	n = ctx->num_internal_temps++;
	compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
	tmp_src = &ctx->internal_temps[n];

	/* internal temps are allocated just past the shader's own
	 * TEMPORARY register file:
	 */
	tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;

	src_from_dst(tmp_src, tmp_dst);

	return tmp_src;
}
759
760static inline bool
761is_const(struct tgsi_src_register *src)
762{
763	return (src->File == TGSI_FILE_CONSTANT) ||
764			(src->File == TGSI_FILE_IMMEDIATE);
765}
766
767static inline bool
768is_relative(struct tgsi_src_register *src)
769{
770	return src->Indirect;
771}
772
/* src needs legalizing (via get_unconst()) if it is either indirect
 * or from the const file:
 */
static inline bool
is_rel_or_const(struct tgsi_src_register *src)
{
	if (is_relative(src))
		return true;
	return is_const(src);
}
778
/* type to use for float ops (always full-precision 32-bit here): */
static type_t
get_ftype(struct ir3_compile_context *ctx)
{
	return TYPE_F32;
}
784
/* type to use for unsigned integer ops: */
static type_t
get_utype(struct ir3_compile_context *ctx)
{
	return TYPE_U32;
}
790
/* type to use for signed integer ops: */
static type_t
get_stype(struct ir3_compile_context *ctx)
{
	return TYPE_S32;
}
796
797static unsigned
798src_swiz(struct tgsi_src_register *src, int chan)
799{
800	switch (chan) {
801	case 0: return src->SwizzleX;
802	case 1: return src->SwizzleY;
803	case 2: return src->SwizzleZ;
804	case 3: return src->SwizzleW;
805	}
806	assert(0);
807	return 0;
808}
809
810/* for instructions that cannot take a const register as src, if needed
811 * generate a move to temporary gpr:
812 */
static struct tgsi_src_register *
get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
{
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	compile_assert(ctx, is_rel_or_const(src));

	/* copy the const/indirect src into a freshly allocated temp: */
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	create_mov(ctx, &tmp_dst, src);

	return tmp_src;
}
827
/* Find (or allocate) a compiler-generated immediate holding 'val',
 * filling in 'reg' to reference it.  An existing slot holding -val is
 * reused with the Negate modifier set.
 *
 * NOTE(review): the -val match uses *integer* negation of the raw
 * 32-bit value, which is not the same as flipping the sign bit of a
 * float bit-pattern -- presumably only integer-valued immediates rely
 * on the negated match; confirm against callers.
 */
static void
get_immediate(struct ir3_compile_context *ctx,
		struct tgsi_src_register *reg, uint32_t val)
{
	unsigned neg, swiz, idx, i;
	/* actually maps 1:1 currently.. not sure if that is safe to rely on: */
	static const unsigned swiz2tgsi[] = {
			TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
	};

	/* scan already-emitted immediates for a match: */
	for (i = 0; i < ctx->immediate_idx; i++) {
		swiz = i % 4;
		idx  = i / 4;

		if (ctx->so->immediates[idx].val[swiz] == val) {
			neg = 0;
			break;
		}

		if (ctx->so->immediates[idx].val[swiz] == -val) {
			neg = 1;
			break;
		}
	}

	if (i == ctx->immediate_idx) {
		/* need to generate a new immediate: */
		swiz = i % 4;
		idx  = i / 4;
		neg  = 0;
		ctx->so->immediates[idx].val[swiz] = val;
		ctx->so->immediates_count = idx + 1;
		ctx->immediate_idx++;
	}

	/* broadcast the matched component across all four swizzles: */
	reg->File      = TGSI_FILE_IMMEDIATE;
	reg->Indirect  = 0;
	reg->Dimension = 0;
	reg->Index     = idx;
	reg->Absolute  = 0;
	reg->Negate    = neg;
	reg->SwizzleX  = swiz2tgsi[swiz];
	reg->SwizzleY  = swiz2tgsi[swiz];
	reg->SwizzleZ  = swiz2tgsi[swiz];
	reg->SwizzleW  = swiz2tgsi[swiz];
}
874
/* Emit a per-component move from 'src' to 'dst' for every channel in
 * dst's WriteMask.  abs/neg modifiers cannot be encoded on a plain
 * mov, so absneg.f is used in that case.
 */
static void
create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
		struct tgsi_src_register *src)
{
	type_t type_mov = get_ftype(ctx);
	unsigned i;

	for (i = 0; i < 4; i++) {
		/* move to destination: */
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *instr;

			if (src->Absolute || src->Negate) {
				/* can't have abs or neg on a mov instr, so use
				 * absneg.f instead to handle these cases:
				 */
				instr = instr_create(ctx, 2, OPC_ABSNEG_F);
			} else {
				instr = instr_create(ctx, 1, 0);
				instr->cat1.src_type = type_mov;
				instr->cat1.dst_type = type_mov;
			}

			add_dst_reg(ctx, instr, dst, i);
			add_src_reg(ctx, instr, src, src_swiz(src, i));
		}
	}
}
903
/* Clamp 'val' into [minval, maxval] and write the result to 'dst',
 * lowered as max.f followed by min.f:
 */
static void
create_clamp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *val,
		struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
{
	struct ir3_instruction *instr;

	instr = instr_create(ctx, 2, OPC_MAX_F);
	vectorize(ctx, instr, dst, 2, val, 0, minval, 0);

	instr = instr_create(ctx, 2, OPC_MIN_F);
	vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
}
917
/* Clamp the value already in 'dst' between two immediate values
 * (raw 32-bit patterns), reading dst back as the clamp input.
 */
static void
create_clamp_imm(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst,
		uint32_t minval, uint32_t maxval)
{
	struct tgsi_src_register minconst, maxconst;
	struct tgsi_src_register src;

	src_from_dst(&src, dst);

	get_immediate(ctx, &minconst, minval);
	get_immediate(ctx, &maxconst, maxval);

	create_clamp(ctx, dst, &src, &minconst, &maxconst);
}
933
/* Return the dst register to write for 'inst'.  If the dst aliases
 * one of the srcs (and the write isn't a full identity-swizzle xyzw
 * overwrite), redirect the write to an internal temp so src
 * components aren't clobbered mid-operation; put_dst() copies the
 * temp back afterwards.
 */
static struct tgsi_dst_register *
get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	unsigned i;

	compile_assert(ctx, !ctx->using_tmp_dst);
	ctx->using_tmp_dst = true;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_src_register *src = &inst->Src[i].Register;
		if ((src->File == dst->File) && (src->Index == dst->Index)) {
			/* full overwrite through an identity swizzle is safe: */
			if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
					(src->SwizzleX == TGSI_SWIZZLE_X) &&
					(src->SwizzleY == TGSI_SWIZZLE_Y) &&
					(src->SwizzleZ == TGSI_SWIZZLE_Z) &&
					(src->SwizzleW == TGSI_SWIZZLE_W))
				continue;
			ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
			ctx->tmp_dst.WriteMask = dst->WriteMask;
			dst = &ctx->tmp_dst;
			break;
		}
	}
	return dst;
}
960
/* Finish a dst obtained from get_dst(): if the write was redirected
 * to an internal temp, move the temp back into the real dst.
 */
static void
put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
		struct tgsi_dst_register *dst)
{
	compile_assert(ctx, ctx->using_tmp_dst);
	ctx->using_tmp_dst = false;

	/* if necessary, add mov back into original dst: */
	if (dst != &inst->Dst[0].Register) {
		create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
	}
}
973
974/* helper to generate the necessary repeat and/or additional instructions
975 * to turn a scalar instruction into a vector operation:
976 */
/* Expand a scalar instruction to a vector operation: the varargs are
 * 'nsrcs' (src, flags) pairs -- when flags has IR3_REG_IMMED set, the
 * src pointer slot actually smuggles an integer immediate.  The
 * template instruction is built once for .x, then cloned for each
 * additional channel in dst's WriteMask, fixing up dst/src components
 * per channel.  Runs as an atomic group so all clones read pre-
 * instruction values.
 */
static void
vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		struct tgsi_dst_register *dst, int nsrcs, ...)
{
	va_list ap;
	int i, j, n = 0;

	instr_atomic_start(ctx);

	add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);

	va_start(ap, nsrcs);
	for (j = 0; j < nsrcs; j++) {
		struct tgsi_src_register *src =
				va_arg(ap, struct tgsi_src_register *);
		unsigned flags = va_arg(ap, unsigned);
		struct ir3_register *reg;
		if (flags & IR3_REG_IMMED) {
			reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
			/* this is an ugly cast.. should have put flags first! */
			reg->iim_val = *(int *)&src;
		} else {
			reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
		}
		/* NEGATE is xor'd (toggled) rather than or'd, so a negate
		 * in 'flags' cancels an existing negate on the src:
		 */
		reg->flags |= flags & ~IR3_REG_NEGATE;
		if (flags & IR3_REG_NEGATE)
			reg->flags ^= IR3_REG_NEGATE;
	}
	va_end(ap);

	for (i = 0; i < 4; i++) {
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *cur;

			/* first written channel reuses the template instr: */
			if (n++ == 0) {
				cur = instr;
			} else {
				cur = instr_clone(ctx, instr);
			}

			ssa_dst(ctx, cur, dst, i);

			/* fix-up dst register component: */
			cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);

			/* fix-up src register component: */
			va_start(ap, nsrcs);
			for (j = 0; j < nsrcs; j++) {
				struct ir3_register *reg = cur->regs[j+1];
				struct tgsi_src_register *src =
						va_arg(ap, struct tgsi_src_register *);
				unsigned flags = va_arg(ap, unsigned);
				if (reg->flags & IR3_REG_SSA) {
					ssa_src(ctx, reg, src, src_swiz(src, i));
				} else if (!(flags & IR3_REG_IMMED)) {
					reg->num = regid(reg->num >> 2, src_swiz(src, i));
				}
			}
			va_end(ap);
		}
	}

	instr_atomic_end(ctx);
}
1041
1042/*
1043 * Handlers for TGSI instructions which do not have a 1:1 mapping to
1044 * native instructions:
1045 */
1046
/* CLAMP dst, src0, src1, src2 -> dst = min(max(src0, src1), src2),
 * lowered via create_clamp() (max.f + min.f):
 */
static void
trans_clamp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct tgsi_src_register *src2 = &inst->Src[2].Register;

	create_clamp(ctx, dst, src0, src1, src2);

	put_dst(ctx, inst, dst);
}
1061
/* ARL(x) = x, but mova from hrN.x to a0..
 *
 * Lowered as three steps: convert the src to s16 (from float for ARL,
 * unsigned for UARL), shift left by 2 (scale by the 4 scalar regs per
 * TGSI vec4), then mova the result into the address register.
 */
static void
trans_arl(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	unsigned chan = src->SwizzleX;

	compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);

	/* NOTE: we allocate a temporary from a flat register
	 * namespace (ignoring half vs full).  It turns out
	 * not to really matter since registers get reassigned
	 * later in ir3_ra which (hopefully!) can deal a bit
	 * better with mixed half and full precision.
	 */
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	/* cov.{u,f}{32,16}s16 Rtmp, Rsrc */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = (t->tgsi_opc == TGSI_OPCODE_ARL) ?
			get_ftype(ctx) : get_utype(ctx);
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, src, chan);

	/* shl.b Rtmp, Rtmp, 2 */
	instr = instr_create(ctx, 2, OPC_SHL_B);
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;

	/* mova a0, Rtmp */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = TYPE_S16;
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
}
1106
1107/*
1108 * texture fetch/sample instructions:
1109 */
1110
/* Per-instruction texture info, filled in by fill_tex_info(): */
struct tex_info {
	int8_t order[4];   /* first-arg layout: hw slot -> tgsi src component, -1 = unused */
	int8_t args;       /* number of hw sam src args (1 or 2) */
	unsigned src_wrmask, flags;   /* coord writemask, and IR3_INSTR_* flags */
};
1116
/* Static properties of a TGSI texture target: */
struct target_info {
	uint8_t dims;     /* number of coordinate dimensions */
	uint8_t cube;     /* cube map (incl. cube array)? */
	uint8_t array;    /* array texture (extra layer-index coord)? */
	uint8_t shadow;   /* shadow sampler (extra reference value)? */
};
1123
/* Map TGSI_TEXTURE_* target to { dims, cube, array, shadow }: */
static const struct target_info tex_targets[] = {
	[TGSI_TEXTURE_1D]               = { 1, 0, 0, 0 },
	[TGSI_TEXTURE_2D]               = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_3D]               = { 3, 0, 0, 0 },
	[TGSI_TEXTURE_CUBE]             = { 3, 1, 0, 0 },
	[TGSI_TEXTURE_RECT]             = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_SHADOW1D]         = { 1, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOW2D]         = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOWRECT]       = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_1D_ARRAY]         = { 1, 0, 1, 0 },
	[TGSI_TEXTURE_2D_ARRAY]         = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_SHADOW1D_ARRAY]   = { 1, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOW2D_ARRAY]   = { 2, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOWCUBE]       = { 3, 1, 0, 1 },
	[TGSI_TEXTURE_2D_MSAA]          = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_2D_ARRAY_MSAA]    = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_CUBE_ARRAY]       = { 3, 1, 1, 0 },
	[TGSI_TEXTURE_SHADOWCUBE_ARRAY] = { 3, 1, 1, 1 },
};
1143
/* Derive per-instruction texture info from the TGSI texture target
 * and opcode: hw instruction flags, number of sam src args, and the
 * component layout required for the first src arg.
 */
static void
fill_tex_info(struct ir3_compile_context *ctx,
			  struct tgsi_full_instruction *inst,
			  struct tex_info *info)
{
	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];

	if (tgt->dims == 3)
		info->flags |= IR3_INSTR_3D;
	if (tgt->array)
		info->flags |= IR3_INSTR_A;
	if (tgt->shadow)
		info->flags |= IR3_INSTR_S;

	/* opcodes with an explicit bias/lod take a second src arg: */
	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXB2:
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXF:
		info->args = 2;
		break;
	case TGSI_OPCODE_TXP:
		info->flags |= IR3_INSTR_P;
		/* fallthrough */
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXD:
		info->args = 1;
		break;
	}

	/*
	 * lay out the first argument in the proper order:
	 *  - actual coordinates first
	 *  - array index
	 *  - shadow reference
	 *  - projection w
	 *
	 * bias/lod go into the second arg
	 */
	int arg, pos = 0;
	for (arg = 0; arg < tgt->dims; arg++)
		info->order[arg] = pos++;
	if (tgt->dims == 1)
		info->order[pos++] = -1;   /* 1D targets still pad out .y */
	if (tgt->shadow)
		info->order[pos++] = MAX2(arg + tgt->array, 2);
	if (tgt->array)
		info->order[pos++] = arg++;
	if (info->flags & IR3_INSTR_P)
		info->order[pos++] = 3;

	/* components 0..pos-1 of the gathered coord are used: */
	info->src_wrmask = (1 << pos) - 1;

	/* mark remaining slots unused: */
	for (; pos < 4; pos++)
		info->order[pos] = -1;

	assert(pos <= 4);
}
1202
1203static bool check_swiz(struct tgsi_src_register *src, const int8_t order[4])
1204{
1205	unsigned i;
1206	for (i = 1; (i < 4) && order[i] >= 0; i++)
1207		if (src_swiz(src, i) != (src_swiz(src, 0) + order[i]))
1208			return false;
1209	return true;
1210}
1211
1212static bool is_1d(unsigned tex)
1213{
1214	return tex_targets[tex].dims == 1;
1215}
1216
1217static struct tgsi_src_register *
1218get_tex_coord(struct ir3_compile_context *ctx,
1219		struct tgsi_full_instruction *inst,
1220		const struct tex_info *tinf)
1221{
1222	struct tgsi_src_register *coord = &inst->Src[0].Register;
1223	struct ir3_instruction *instr;
1224	unsigned tex = inst->Texture.Texture;
1225	struct tgsi_dst_register tmp_dst;
1226	struct tgsi_src_register *tmp_src;
1227	type_t type_mov = get_ftype(ctx);
1228	unsigned j;
1229
1230	/* need to move things around: */
1231	tmp_src = get_internal_temp(ctx, &tmp_dst);
1232
1233	for (j = 0; j < 4; j++) {
1234		if (tinf->order[j] < 0)
1235			continue;
1236		instr = instr_create(ctx, 1, 0);  /* mov */
1237		instr->cat1.src_type = type_mov;
1238		instr->cat1.dst_type = type_mov;
1239		add_dst_reg(ctx, instr, &tmp_dst, j);
1240		add_src_reg(ctx, instr, coord,
1241				src_swiz(coord, tinf->order[j]));
1242	}
1243
1244	/* fix up .y coord: */
1245	if (is_1d(tex)) {
1246		struct ir3_register *imm;
1247		instr = instr_create(ctx, 1, 0);  /* mov */
1248		instr->cat1.src_type = type_mov;
1249		instr->cat1.dst_type = type_mov;
1250		add_dst_reg(ctx, instr, &tmp_dst, 1);  /* .y */
1251		imm = ir3_reg_create(instr, 0, IR3_REG_IMMED);
1252		if (inst->Instruction.Opcode == TGSI_OPCODE_TXF)
1253			imm->iim_val = 0;
1254		else
1255			imm->fim_val = 0.5;
1256	}
1257
1258	return tmp_src;
1259}
1260
/* Translate the texture sampling instructions (TEX/TXP/TXB/TXB2/TXL/
 * TXD/TXF) into a cat5 sample instruction.  The coordinate (plus,
 * where applicable, derivatives, offsets and lod/bias) sources are
 * gathered into meta:fi (fan-in) instructions feeding the sam.
 */
static void
trans_samp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *collect;
	struct ir3_register *reg;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *orig, *coord, *samp, *offset, *dpdx, *dpdy;
	struct tgsi_src_register zero;
	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
	struct tex_info tinf;
	int i;

	memset(&tinf, 0, sizeof(tinf));
	fill_tex_info(ctx, inst, &tinf);
	coord = get_tex_coord(ctx, inst, &tinf);
	get_immediate(ctx, &zero, 0);

	/* the src operand holding lod/bias ("orig") and the sampler vary
	 * per opcode:
	 */
	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_TXB2:
		orig = &inst->Src[1].Register;
		samp = &inst->Src[2].Register;
		break;
	case TGSI_OPCODE_TXD:
		orig = &inst->Src[0].Register;
		dpdx = &inst->Src[1].Register;
		dpdy = &inst->Src[2].Register;
		samp = &inst->Src[3].Register;
		if (is_rel_or_const(dpdx))
				dpdx = get_unconst(ctx, dpdx);
		if (is_rel_or_const(dpdy))
				dpdy = get_unconst(ctx, dpdy);
		break;
	default:
		orig = &inst->Src[0].Register;
		samp = &inst->Src[1].Register;
		break;
	}
	if (tinf.args > 1 && is_rel_or_const(orig))
		orig = get_unconst(ctx, orig);

	/* scale up integer coords for TXF based on the LOD */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;
		type_t type_mov = get_utype(ctx);

		tmp_src = get_internal_temp(ctx, &tmp_dst);
		for (i = 0; i < tgt->dims; i++) {
			/* shl.b tmp.<i>, coord, lod */
			instr = instr_create(ctx, 2, OPC_SHL_B);
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, coord, src_swiz(coord, i));
			add_src_reg(ctx, instr, orig, orig->SwizzleW);
		}
		if (tgt->dims < 2) {
			/* pad .y with zero for 1D targets: */
			instr = instr_create(ctx, 1, 0);
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, &zero, 0);
			i++;
		}
		if (tgt->array) {
			/* the layer index is not scaled: */
			instr = instr_create(ctx, 1, 0);
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, coord, src_swiz(coord, i));
		}
		coord = tmp_src;
	}

	/* texel offsets must be moved into a gpr: */
	if (inst->Texture.NumOffsets) {
		struct tgsi_texture_offset *tex_offset = &inst->TexOffsets[0];
		struct tgsi_src_register offset_src = {0};

		offset_src.File = tex_offset->File;
		offset_src.Index = tex_offset->Index;
		offset_src.SwizzleX = tex_offset->SwizzleX;
		offset_src.SwizzleY = tex_offset->SwizzleY;
		offset_src.SwizzleZ = tex_offset->SwizzleZ;
		offset = get_unconst(ctx, &offset_src);
		tinf.flags |= IR3_INSTR_O;
	}

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	instr->cat5.samp = samp->Index;
	instr->cat5.tex  = samp->Index;
	instr->flags |= tinf.flags;

	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);

	/* first src arg: fan-in of the gathered coord components: */
	reg = ir3_reg_create(instr, 0, IR3_REG_SSA);

	collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
	ir3_reg_create(collect, 0, 0);
	for (i = 0; i < 4; i++)
		if (tinf.src_wrmask & (1 << i))
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					coord, src_swiz(coord, i));
		else if (tinf.src_wrmask & ~((1 << i) - 1))
			ir3_reg_create(collect, 0, 0);

	/* Attach derivatives onto the end of the fan-in. Derivatives start after
	 * the 4th argument, so make sure that fi is padded up to 4 first.
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
		while (collect->regs_count < 5)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdx, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdy, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
		tinf.src_wrmask |= ((1 << (2 * MAX2(tgt->dims, 2))) - 1) << 4;
	}

	reg->instr = collect;
	reg->wrmask = tinf.src_wrmask;

	/* The second argument contains the offsets, followed by the lod/bias
	 * argument. This is constructed more manually due to the dynamic nature.
	 */
	if (inst->Texture.NumOffsets == 0 && tinf.args == 1)
		return;

	reg = ir3_reg_create(instr, 0, IR3_REG_SSA);

	collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
	ir3_reg_create(collect, 0, 0);

	if (inst->Texture.NumOffsets) {
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					offset, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
	}
	/* TXB2 passes the bias in .x of the second src; other 2-arg
	 * opcodes pass lod/bias in .w of the first src:
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2)
		ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
				orig, orig->SwizzleX);
	else if (tinf.args > 1)
		ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
				orig, orig->SwizzleW);

	reg->instr = collect;
	reg->wrmask = (1 << (collect->regs_count - 1)) - 1;
}
1414
1415static void
1416trans_txq(const struct instr_translater *t,
1417		struct ir3_compile_context *ctx,
1418		struct tgsi_full_instruction *inst)
1419{
1420	struct ir3_instruction *instr;
1421	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
1422	struct tgsi_src_register *level = &inst->Src[0].Register;
1423	struct tgsi_src_register *samp = &inst->Src[1].Register;
1424	struct tex_info tinf;
1425
1426	memset(&tinf, 0, sizeof(tinf));
1427	fill_tex_info(ctx, inst, &tinf);
1428	if (is_rel_or_const(level))
1429		level = get_unconst(ctx, level);
1430
1431	instr = instr_create(ctx, 5, OPC_GETSIZE);
1432	instr->cat5.type = get_utype(ctx);
1433	instr->cat5.samp = samp->Index;
1434	instr->cat5.tex  = samp->Index;
1435	instr->flags |= tinf.flags;
1436
1437	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
1438	add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1);
1439}
1440
1441/* DDX/DDY */
1442static void
1443trans_deriv(const struct instr_translater *t,
1444		struct ir3_compile_context *ctx,
1445		struct tgsi_full_instruction *inst)
1446{
1447	struct ir3_instruction *instr;
1448	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
1449	struct tgsi_src_register *src = &inst->Src[0].Register;
1450	static const int8_t order[4] = {0, 1, 2, 3};
1451
1452	if (!check_swiz(src, order)) {
1453		struct tgsi_dst_register tmp_dst;
1454		struct tgsi_src_register *tmp_src;
1455
1456		tmp_src = get_internal_temp(ctx, &tmp_dst);
1457		create_mov(ctx, &tmp_dst, src);
1458
1459		src = tmp_src;
1460	}
1461
1462	/* This might be a workaround for hw bug?  Blob compiler always
1463	 * seems to work two components at a time for dsy/dsx.  It does
1464	 * actually seem to work in some cases (or at least some piglit
1465	 * tests) for four components at a time.  But seems more reliable
1466	 * to split this into two instructions like the blob compiler
1467	 * does:
1468	 */
1469
1470	instr = instr_create(ctx, 5, t->opc);
1471	instr->cat5.type = get_ftype(ctx);
1472	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask & 0x3);
1473	add_src_reg_wrmask(ctx, instr, src, 0, dst->WriteMask & 0x3);
1474
1475	instr = instr_create(ctx, 5, t->opc);
1476	instr->cat5.type = get_ftype(ctx);
1477	add_dst_reg_wrmask(ctx, instr, dst, 2, (dst->WriteMask >> 2) & 0x3);
1478	add_src_reg_wrmask(ctx, instr, src, 2, (dst->WriteMask >> 2) & 0x3);
1479}
1480
1481/*
1482 * SEQ(a,b) = (a == b) ? 1.0 : 0.0
1483 *   cmps.f.eq tmp0, a, b
1484 *   cov.u16f16 dst, tmp0
1485 *
1486 * SNE(a,b) = (a != b) ? 1.0 : 0.0
1487 *   cmps.f.ne tmp0, a, b
1488 *   cov.u16f16 dst, tmp0
1489 *
1490 * SGE(a,b) = (a >= b) ? 1.0 : 0.0
1491 *   cmps.f.ge tmp0, a, b
1492 *   cov.u16f16 dst, tmp0
1493 *
1494 * SLE(a,b) = (a <= b) ? 1.0 : 0.0
1495 *   cmps.f.le tmp0, a, b
1496 *   cov.u16f16 dst, tmp0
1497 *
1498 * SGT(a,b) = (a > b)  ? 1.0 : 0.0
1499 *   cmps.f.gt tmp0, a, b
1500 *   cov.u16f16 dst, tmp0
1501 *
1502 * SLT(a,b) = (a < b)  ? 1.0 : 0.0
1503 *   cmps.f.lt tmp0, a, b
1504 *   cov.u16f16 dst, tmp0
1505 *
1506 * CMP(a,b,c) = (a < 0.0) ? b : c
1507 *   cmps.f.lt tmp0, a, {0.0}
1508 *   sel.b16 dst, b, tmp0, c
1509 */
/* Handle the float compare opcodes (SEQ/SNE/SGE/SLE/SGT/SLT, the FS*
 * variants, and CMP) -- see expansion comments above.  All forms do a
 * cmps.f into a temp, then convert the 0/1 predicate into the result
 * the opcode wants.
 */
static void
trans_cmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval0;
	/* final instruction for CMP() uses orig src1 and src2: */
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a0, *a1, *a2;
	unsigned condition;

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */

	/* select compare condition (for CMP, compare 'a' against {0.0}): */
	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_FSEQ:
		condition = IR3_COND_EQ;
		break;
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_FSNE:
		condition = IR3_COND_NE;
		break;
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_FSGE:
		condition = IR3_COND_GE;
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_FSLT:
		condition = IR3_COND_LT;
		break;
	case TGSI_OPCODE_SLE:
		condition = IR3_COND_LE;
		break;
	case TGSI_OPCODE_SGT:
		condition = IR3_COND_GT;
		break;
	case TGSI_OPCODE_CMP:
		get_immediate(ctx, &constval0, fui(0.0));
		a0 = &inst->Src[0].Register;  /* a */
		a1 = &constval0;              /* {0.0} */
		condition = IR3_COND_LT;
		break;
	default:
		compile_assert(ctx, 0);
		return;
	}

	/* cat2 cannot take two const srcs; move one into a gpr: */
	if (is_const(a0) && is_const(a1))
		a0 = get_unconst(ctx, a0);

	/* cmps.f.<cond> tmp, a0, a1 */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = condition;
	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);

	/* convert the 0/1 predicate into the final result: */
	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLT:
		/* cov.u16f16 dst, tmp0 */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = get_utype(ctx);
		instr->cat1.dst_type = get_ftype(ctx);
		vectorize(ctx, instr, dst, 1, tmp_src, 0);
		break;
	case TGSI_OPCODE_FSEQ:
	case TGSI_OPCODE_FSGE:
	case TGSI_OPCODE_FSNE:
	case TGSI_OPCODE_FSLT:
		/* absneg.s dst, (neg)tmp0 */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE);
		break;
	case TGSI_OPCODE_CMP:
		a1 = &inst->Src[1].Register;
		a2 = &inst->Src[2].Register;
		/* sel.{b32,b16} dst, src2, tmp, src1 */
		instr = instr_create(ctx, 3, OPC_SEL_B32);
		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);

		break;
	}

	put_dst(ctx, inst, dst);
}
1604
1605/*
1606 * USNE(a,b) = (a != b) ? ~0 : 0
1607 *   cmps.u32.ne dst, a, b
1608 *
1609 * USEQ(a,b) = (a == b) ? ~0 : 0
1610 *   cmps.u32.eq dst, a, b
1611 *
 * ISGE(a,b) = (a >= b) ? ~0 : 0
 *   cmps.s32.ge dst, a, b
 *
 * USGE(a,b) = (a >= b) ? ~0 : 0
 *   cmps.u32.ge dst, a, b
1617 *
1618 * ISLT(a,b) = (a < b) ? ~0 : 0
1619 *   cmps.s32.lt dst, a, b
1620 *
1621 * USLT(a,b) = (a < b) ? ~0 : 0
1622 *   cmps.u32.lt dst, a, b
1623 *
1624 */
1625static void
1626trans_icmp(const struct instr_translater *t,
1627		struct ir3_compile_context *ctx,
1628		struct tgsi_full_instruction *inst)
1629{
1630	struct ir3_instruction *instr;
1631	struct tgsi_dst_register *dst = get_dst(ctx, inst);
1632	struct tgsi_dst_register tmp_dst;
1633	struct tgsi_src_register *tmp_src;
1634	struct tgsi_src_register *a0, *a1;
1635	unsigned condition;
1636
1637	a0 = &inst->Src[0].Register;  /* a */
1638	a1 = &inst->Src[1].Register;  /* b */
1639
1640	switch (t->tgsi_opc) {
1641	case TGSI_OPCODE_USNE:
1642		condition = IR3_COND_NE;
1643		break;
1644	case TGSI_OPCODE_USEQ:
1645		condition = IR3_COND_EQ;
1646		break;
1647	case TGSI_OPCODE_ISGE:
1648	case TGSI_OPCODE_USGE:
1649		condition = IR3_COND_GE;
1650		break;
1651	case TGSI_OPCODE_ISLT:
1652	case TGSI_OPCODE_USLT:
1653		condition = IR3_COND_LT;
1654		break;
1655
1656	default:
1657		compile_assert(ctx, 0);
1658		return;
1659	}
1660
1661	if (is_const(a0) && is_const(a1))
1662		a0 = get_unconst(ctx, a0);
1663
1664	tmp_src = get_internal_temp(ctx, &tmp_dst);
1665	/* cmps.{u32,s32}.<cond> tmp, a0, a1 */
1666	instr = instr_create(ctx, 2, t->opc);
1667	instr->cat2.condition = condition;
1668	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
1669
1670	/* absneg.s dst, (neg)tmp */
1671	instr = instr_create(ctx, 2, OPC_ABSNEG_S);
1672	vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE);
1673
1674	put_dst(ctx, inst, dst);
1675}
1676
1677/*
1678 * UCMP(a,b,c) = a ? b : c
1679 *   sel.b16 dst, b, a, c
1680 */
1681static void
1682trans_ucmp(const struct instr_translater *t,
1683		struct ir3_compile_context *ctx,
1684		struct tgsi_full_instruction *inst)
1685{
1686	struct ir3_instruction *instr;
1687	struct tgsi_dst_register *dst = get_dst(ctx, inst);
1688	struct tgsi_src_register *a0, *a1, *a2;
1689
1690	a0 = &inst->Src[0].Register;  /* a */
1691	a1 = &inst->Src[1].Register;  /* b */
1692	a2 = &inst->Src[2].Register;  /* c */
1693
1694	if (is_rel_or_const(a0))
1695		a0 = get_unconst(ctx, a0);
1696
1697	/* sel.{b32,b16} dst, b, a, c */
1698	instr = instr_create(ctx, 3, OPC_SEL_B32);
1699	vectorize(ctx, instr, dst, 3, a1, 0, a0, 0, a2, 0);
1700	put_dst(ctx, inst, dst);
1701}
1702
1703/*
1704 * ISSG(a) = a < 0 ? -1 : a > 0 ? 1 : 0
1705 *   cmps.s.lt tmp_neg, a, 0  # 1 if a is negative
1706 *   cmps.s.gt tmp_pos, a, 0  # 1 if a is positive
1707 *   sub.u dst, tmp_pos, tmp_neg
1708 */
1709static void
1710trans_issg(const struct instr_translater *t,
1711		struct ir3_compile_context *ctx,
1712		struct tgsi_full_instruction *inst)
1713{
1714	struct ir3_instruction *instr;
1715	struct tgsi_dst_register *dst = get_dst(ctx, inst);
1716	struct tgsi_src_register *a = &inst->Src[0].Register;
1717	struct tgsi_dst_register neg_dst, pos_dst;
1718	struct tgsi_src_register *neg_src, *pos_src;
1719
1720	neg_src = get_internal_temp(ctx, &neg_dst);
1721	pos_src = get_internal_temp(ctx, &pos_dst);
1722
1723	/* cmps.s.lt neg, a, 0 */
1724	instr = instr_create(ctx, 2, OPC_CMPS_S);
1725	instr->cat2.condition = IR3_COND_LT;
1726	vectorize(ctx, instr, &neg_dst, 2, a, 0, 0, IR3_REG_IMMED);
1727
1728	/* cmps.s.gt pos, a, 0 */
1729	instr = instr_create(ctx, 2, OPC_CMPS_S);
1730	instr->cat2.condition = IR3_COND_GT;
1731	vectorize(ctx, instr, &pos_dst, 2, a, 0, 0, IR3_REG_IMMED);
1732
1733	/* sub.u dst, pos, neg */
1734	instr = instr_create(ctx, 2, OPC_SUB_U);
1735	vectorize(ctx, instr, dst, 2, pos_src, 0, neg_src, 0);
1736
1737	put_dst(ctx, inst, dst);
1738}
1739
1740
1741
1742/*
1743 * Conditional / Flow control
1744 */
1745
1746static void
1747push_branch(struct ir3_compile_context *ctx, bool inv,
1748		struct ir3_instruction *instr, struct ir3_instruction *cond)
1749{
1750	unsigned int idx = ctx->branch_count++;
1751	compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
1752	ctx->branch[idx].instr = instr;
1753	ctx->branch[idx].inv = inv;
1754	/* else side of branch has same condition: */
1755	if (!inv)
1756		ctx->branch[idx].cond = cond;
1757}
1758
1759static struct ir3_instruction *
1760pop_branch(struct ir3_compile_context *ctx)
1761{
1762	unsigned int idx = --ctx->branch_count;
1763	return ctx->branch[idx].instr;
1764}
1765
/* IF/UIF: compare the condition src against zero into a temp, emit a
 * meta:flow instruction consuming it, record the open branch, and push
 * a new block for the if side.
 */
static void
trans_if(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *cond;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval;

	get_immediate(ctx, &constval, fui(0.0));
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	/* the condition must come from a gpr so we can grab its SSA
	 * producer below:
	 */
	if (is_const(src))
		src = get_unconst(ctx, src);

	/* cmps.{f,u}.ne tmp0, b, {0.0} */
	instr = instr_create(ctx, 2, t->opc);
	add_dst_reg(ctx, instr, &tmp_dst, 0);
	add_src_reg(ctx, instr, src, src->SwizzleX);
	add_src_reg(ctx, instr, &constval, constval.SwizzleX);
	instr->cat2.condition = IR3_COND_NE;

	/* remember the instruction producing the condition value, for
	 * use by e.g. unconditional KILL inside the branch:
	 */
	compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
	cond = instr->regs[1]->instr;

	/* meta:flow tmp0 */
	instr = instr_create(ctx, -1, OPC_META_FLOW);
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);

	push_branch(ctx, false, instr, cond);
	instr->flow.if_block = push_block(ctx);
}
1801
1802static void
1803trans_else(const struct instr_translater *t,
1804		struct ir3_compile_context *ctx,
1805		struct tgsi_full_instruction *inst)
1806{
1807	struct ir3_instruction *instr;
1808
1809	pop_block(ctx);
1810
1811	instr = pop_branch(ctx);
1812
1813	compile_assert(ctx, (instr->category == -1) &&
1814			(instr->opc == OPC_META_FLOW));
1815
1816	push_branch(ctx, true, instr, NULL);
1817	instr->flow.else_block = push_block(ctx);
1818}
1819
1820static struct ir3_instruction *
1821find_temporary(struct ir3_block *block, unsigned n)
1822{
1823	if (block->parent && !block->temporaries[n])
1824		return find_temporary(block->parent, n);
1825	return block->temporaries[n];
1826}
1827
1828static struct ir3_instruction *
1829find_output(struct ir3_block *block, unsigned n)
1830{
1831	if (block->parent && !block->outputs[n])
1832		return find_output(block->parent, n);
1833	return block->outputs[n];
1834}
1835
1836static struct ir3_instruction *
1837create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
1838		struct ir3_instruction *a, struct ir3_instruction *b)
1839{
1840	struct ir3_instruction *phi;
1841
1842	compile_assert(ctx, cond);
1843
1844	/* Either side of the condition could be null..  which
1845	 * indicates a variable written on only one side of the
1846	 * branch.  Normally this should only be variables not
1847	 * used outside of that side of the branch.  So we could
1848	 * just 'return a ? a : b;' in that case.  But for better
1849	 * defined undefined behavior we just stick in imm{0.0}.
1850	 * In the common case of a value only used within the
1851	 * one side of the branch, the PHI instruction will not
1852	 * get scheduled
1853	 */
1854	if (!a)
1855		a = create_immed(ctx, 0.0);
1856	if (!b)
1857		b = create_immed(ctx, 0.0);
1858
1859	phi = instr_create(ctx, -1, OPC_META_PHI);
1860	ir3_reg_create(phi, 0, 0);  /* dummy dst */
1861	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
1862	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
1863	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;
1864
1865	return phi;
1866}
1867
/* ENDIF: close out the if/else blocks and emit PHI instructions for
 * every temporary and shader output written on either side of the
 * branch, so the parent block sees a single merged value.
 */
static void
trans_endif(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct ir3_block *ifb, *elseb;
	struct ir3_instruction **ifout, **elseout;
	unsigned i, ifnout = 0, elsenout = 0;

	pop_block(ctx);

	instr = pop_branch(ctx);

	compile_assert(ctx, (instr->category == -1) &&
			(instr->opc == OPC_META_FLOW));

	ifb = instr->flow.if_block;
	elseb = instr->flow.else_block;
	/* if there is no else block, the parent block is used for the
	 * branch-not-taken src of the PHI instructions:
	 */
	if (!elseb)
		elseb = ifb->parent;

	/* worst case sizes: */
	ifnout = ifb->ntemporaries + ifb->noutputs;
	elsenout = elseb->ntemporaries + elseb->noutputs;

	/* NOTE(review): elseout is only allocated (and later used) when a
	 * real else block exists -- every use below is guarded by the
	 * same (elseb != ifb->parent) check.
	 */
	ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
	if (elseb != ifb->parent)
		elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);

	ifnout = 0;
	elsenout = 0;

	/* generate PHI instructions for any temporaries written: */
	for (i = 0; i < ifb->ntemporaries; i++) {
		struct ir3_instruction *a = ifb->temporaries[i];
		struct ir3_instruction *b = elseb->temporaries[i];

		/* if temporary written in if-block, or if else block
		 * is present and temporary written in else-block:
		 */
		if (a || ((elseb != ifb->parent) && b)) {
			struct ir3_instruction *phi;

			/* if only written on one side, find the closest
			 * enclosing update on other side:
			 */
			if (!a)
				a = find_temporary(ifb, i);
			if (!b)
				b = find_temporary(elseb, i);

			ifout[ifnout] = a;
			a = create_output(ifb, a, ifnout++);

			if (elseb != ifb->parent) {
				elseout[elsenout] = b;
				b = create_output(elseb, b, elsenout++);
			}

			phi = create_phi(ctx, instr, a, b);
			ctx->block->temporaries[i] = phi;
		}
	}

	compile_assert(ctx, ifb->noutputs == elseb->noutputs);

	/* .. and any outputs written: */
	for (i = 0; i < ifb->noutputs; i++) {
		struct ir3_instruction *a = ifb->outputs[i];
		struct ir3_instruction *b = elseb->outputs[i];

		/* if output written in if-block, or if else block
		 * is present and output written in else-block:
		 */
		if (a || ((elseb != ifb->parent) && b)) {
			struct ir3_instruction *phi;

			/* if only written on one side, find the closest
			 * enclosing update on other side:
			 */
			if (!a)
				a = find_output(ifb, i);
			if (!b)
				b = find_output(elseb, i);

			ifout[ifnout] = a;
			a = create_output(ifb, a, ifnout++);

			if (elseb != ifb->parent) {
				elseout[elsenout] = b;
				b = create_output(elseb, b, elsenout++);
			}

			phi = create_phi(ctx, instr, a, b);
			ctx->block->outputs[i] = phi;
		}
	}

	/* shrink the child blocks' output lists to what was actually used: */
	ifb->noutputs = ifnout;
	ifb->outputs = ifout;

	if (elseb != ifb->parent) {
		elseb->noutputs = elsenout;
		elseb->outputs = elseout;
	}

	// TODO maybe we want to compact block->inputs?
}
1980
1981/*
1982 * Kill
1983 */
1984
1985static void
1986trans_kill(const struct instr_translater *t,
1987		struct ir3_compile_context *ctx,
1988		struct tgsi_full_instruction *inst)
1989{
1990	struct ir3_instruction *instr, *immed, *cond = NULL;
1991	bool inv = false;
1992
1993	switch (t->tgsi_opc) {
1994	case TGSI_OPCODE_KILL:
1995		/* unconditional kill, use enclosing if condition: */
1996		if (ctx->branch_count > 0) {
1997			unsigned int idx = ctx->branch_count - 1;
1998			cond = ctx->branch[idx].cond;
1999			inv = ctx->branch[idx].inv;
2000		} else {
2001			cond = create_immed(ctx, 1.0);
2002		}
2003
2004		break;
2005	}
2006
2007	compile_assert(ctx, cond);
2008
2009	immed = create_immed(ctx, 0.0);
2010
2011	/* cmps.f.ne p0.x, cond, {0.0} */
2012	instr = instr_create(ctx, 2, OPC_CMPS_F);
2013	instr->cat2.condition = IR3_COND_NE;
2014	ir3_reg_create(instr, regid(REG_P0, 0), 0);
2015	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
2016	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
2017	cond = instr;
2018
2019	/* kill p0.x */
2020	instr = instr_create(ctx, 0, OPC_KILL);
2021	instr->cat0.inv = inv;
2022	ir3_reg_create(instr, 0, 0);  /* dummy dst */
2023	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
2024
2025	ctx->kill[ctx->kill_count++] = instr;
2026
2027	ctx->so->has_kill = true;
2028}
2029
2030/*
2031 * Kill-If
2032 */
2033
2034static void
2035trans_killif(const struct instr_translater *t,
2036		struct ir3_compile_context *ctx,
2037		struct tgsi_full_instruction *inst)
2038{
2039	struct tgsi_src_register *src = &inst->Src[0].Register;
2040	struct ir3_instruction *instr, *immed, *cond = NULL;
2041	bool inv = false;
2042
2043	immed = create_immed(ctx, 0.0);
2044
2045	/* cmps.f.ne p0.x, cond, {0.0} */
2046	instr = instr_create(ctx, 2, OPC_CMPS_F);
2047	instr->cat2.condition = IR3_COND_NE;
2048	ir3_reg_create(instr, regid(REG_P0, 0), 0);
2049	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
2050	add_src_reg(ctx, instr, src, src->SwizzleX);
2051
2052	cond = instr;
2053
2054	/* kill p0.x */
2055	instr = instr_create(ctx, 0, OPC_KILL);
2056	instr->cat0.inv = inv;
2057	ir3_reg_create(instr, 0, 0);  /* dummy dst */
2058	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
2059
2060	ctx->kill[ctx->kill_count++] = instr;
2061
2062	ctx->so->has_kill = true;
2063
2064}
2065/*
2066 * I2F / U2F / F2I / F2U
2067 */
2068
2069static void
2070trans_cov(const struct instr_translater *t,
2071		struct ir3_compile_context *ctx,
2072		struct tgsi_full_instruction *inst)
2073{
2074	struct ir3_instruction *instr;
2075	struct tgsi_dst_register *dst = get_dst(ctx, inst);
2076	struct tgsi_src_register *src = &inst->Src[0].Register;
2077
2078	// cov.f32s32 dst, tmp0 /
2079	instr = instr_create(ctx, 1, 0);
2080	switch (t->tgsi_opc) {
2081	case TGSI_OPCODE_U2F:
2082		instr->cat1.src_type = TYPE_U32;
2083		instr->cat1.dst_type = TYPE_F32;
2084		break;
2085	case TGSI_OPCODE_I2F:
2086		instr->cat1.src_type = TYPE_S32;
2087		instr->cat1.dst_type = TYPE_F32;
2088		break;
2089	case TGSI_OPCODE_F2U:
2090		instr->cat1.src_type = TYPE_F32;
2091		instr->cat1.dst_type = TYPE_U32;
2092		break;
2093	case TGSI_OPCODE_F2I:
2094		instr->cat1.src_type = TYPE_F32;
2095		instr->cat1.dst_type = TYPE_S32;
2096		break;
2097
2098	}
2099	vectorize(ctx, instr, dst, 1, src, 0);
2100	put_dst(ctx, inst, dst);
2101}
2102
2103/*
2104 * UMUL / UMAD
2105 *
2106 * There is no 32-bit multiply instruction, so splitting a and b into high and
2107 * low components, we get that
2108 *
2109 * dst = al * bl + ah * bl << 16 + al * bh << 16
2110 *
2111 *  mull.u tmp0, a, b (mul low, i.e. al * bl)
2112 *  madsh.m16 tmp1, a, b, tmp0 (mul-add shift high mix, i.e. ah * bl << 16)
2113 *  madsh.m16 dst, b, a, tmp1 (i.e. al * bh << 16)
2114 *
2115 * For UMAD, add in the extra argument after mull.u.
2116 */
2117static void
2118trans_umul(const struct instr_translater *t,
2119		struct ir3_compile_context *ctx,
2120		struct tgsi_full_instruction *inst)
2121{
2122	struct ir3_instruction *instr;
2123	struct tgsi_dst_register *dst = get_dst(ctx, inst);
2124	struct tgsi_src_register *a = &inst->Src[0].Register;
2125	struct tgsi_src_register *b = &inst->Src[1].Register;
2126
2127	struct tgsi_dst_register tmp0_dst, tmp1_dst;
2128	struct tgsi_src_register *tmp0_src, *tmp1_src;
2129
2130	tmp0_src = get_internal_temp(ctx, &tmp0_dst);
2131	tmp1_src = get_internal_temp(ctx, &tmp1_dst);
2132
2133	if (is_rel_or_const(a))
2134		a = get_unconst(ctx, a);
2135	if (is_rel_or_const(b))
2136		b = get_unconst(ctx, b);
2137
2138	/* mull.u tmp0, a, b */
2139	instr = instr_create(ctx, 2, OPC_MULL_U);
2140	vectorize(ctx, instr, &tmp0_dst, 2, a, 0, b, 0);
2141
2142	if (t->tgsi_opc == TGSI_OPCODE_UMAD) {
2143		struct tgsi_src_register *c = &inst->Src[2].Register;
2144
2145		/* add.u tmp0, tmp0, c */
2146		instr = instr_create(ctx, 2, OPC_ADD_U);
2147		vectorize(ctx, instr, &tmp0_dst, 2, tmp0_src, 0, c, 0);
2148	}
2149
2150	/* madsh.m16 tmp1, a, b, tmp0 */
2151	instr = instr_create(ctx, 3, OPC_MADSH_M16);
2152	vectorize(ctx, instr, &tmp1_dst, 3, a, 0, b, 0, tmp0_src, 0);
2153
2154	/* madsh.m16 dst, b, a, tmp1 */
2155	instr = instr_create(ctx, 3, OPC_MADSH_M16);
2156	vectorize(ctx, instr, dst, 3, b, 0, a, 0, tmp1_src, 0);
2157	put_dst(ctx, inst, dst);
2158}
2159
2160/*
2161 * IDIV / UDIV / MOD / UMOD
2162 *
2163 * See NV50LegalizeSSA::handleDIV for the origin of this implementation. For
2164 * MOD/UMOD, it becomes a - [IU]DIV(a, modulus) * modulus.
2165 */
/* Integer division/modulo, lowered to a float-reciprocal estimate followed
 * by two rounds of integer correction (see the NV50LegalizeSSA reference
 * above).  Phases:
 *   1. convert |a|, |b| to float, estimate q = af * rcp(bf)
 *   2. compute remainder r = a - q*b and refine q with a float correction
 *   3. final integer fixup: if (a - q*b) >= b, bump q by one
 *   4. for signed div, negate q when the operand signs differ
 *   5. for MOD/UMOD, compute a - q*b from the (pre-negation) quotient
 */
static void
trans_idiv(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst), *premod_dst = dst;
	struct tgsi_src_register *a = &inst->Src[0].Register;
	struct tgsi_src_register *b = &inst->Src[1].Register;

	/* temporaries: af/bf are float copies, q is the quotient estimate,
	 * r is scratch for remainders/corrections, a/b hold |a|/|b|:
	 */
	struct tgsi_dst_register af_dst, bf_dst, q_dst, r_dst, a_dst, b_dst;
	struct tgsi_src_register *af_src, *bf_src, *q_src, *r_src, *a_src, *b_src;

	struct tgsi_src_register negative_2, thirty_one;
	type_t src_type;

	if (t->tgsi_opc == TGSI_OPCODE_IDIV || t->tgsi_opc == TGSI_OPCODE_MOD)
		src_type = get_stype(ctx);
	else
		src_type = get_utype(ctx);

	af_src = get_internal_temp(ctx, &af_dst);
	bf_src = get_internal_temp(ctx, &bf_dst);
	q_src = get_internal_temp(ctx, &q_dst);
	r_src = get_internal_temp(ctx, &r_dst);
	a_src = get_internal_temp(ctx, &a_dst);
	b_src = get_internal_temp(ctx, &b_dst);

	get_immediate(ctx, &negative_2, -2);
	get_immediate(ctx, &thirty_one, 31);

	/* for MOD/UMOD the division result is parked in q so we can
	 * multiply it back out below; only then is dst written:
	 */
	if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD)
		premod_dst = &q_dst;

	/* cov.[us]32f32 af, numerator */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = src_type;
	instr->cat1.dst_type = get_ftype(ctx);
	vectorize(ctx, instr, &af_dst, 1, a, 0);

	/* cov.[us]32f32 bf, denominator */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = src_type;
	instr->cat1.dst_type = get_ftype(ctx);
	vectorize(ctx, instr, &bf_dst, 1, b, 0);

	/* Get the absolute values for IDIV */
	if (type_sint(src_type)) {
		/* absneg.f af, (abs)af */
		instr = instr_create(ctx, 2, OPC_ABSNEG_F);
		vectorize(ctx, instr, &af_dst, 1, af_src, IR3_REG_ABS);

		/* absneg.f bf, (abs)bf */
		instr = instr_create(ctx, 2, OPC_ABSNEG_F);
		vectorize(ctx, instr, &bf_dst, 1, bf_src, IR3_REG_ABS);

		/* absneg.s a, (abs)numerator */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, &a_dst, 1, a, IR3_REG_ABS);

		/* absneg.s b, (abs)denominator */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, &b_dst, 1, b, IR3_REG_ABS);
	} else {
		/* mov.u32u32 a, numerator */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = src_type;
		instr->cat1.dst_type = src_type;
		vectorize(ctx, instr, &a_dst, 1, a, 0);

		/* mov.u32u32 b, denominator */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = src_type;
		instr->cat1.dst_type = src_type;
		vectorize(ctx, instr, &b_dst, 1, b, 0);
	}

	/* rcp.f bf, bf */
	instr = instr_create(ctx, 4, OPC_RCP);
	vectorize(ctx, instr, &bf_dst, 1, bf_src, 0);

	/* That's right, subtract 2 as an integer from the float */
	/* (nudges the reciprocal estimate down by 2 ulps so the initial
	 * quotient estimate never overshoots)
	 */
	/* add.u bf, bf, -2 */
	instr = instr_create(ctx, 2, OPC_ADD_U);
	vectorize(ctx, instr, &bf_dst, 2, bf_src, 0, &negative_2, 0);

	/* mul.f q, af, bf */
	instr = instr_create(ctx, 2, OPC_MUL_F);
	vectorize(ctx, instr, &q_dst, 2, af_src, 0, bf_src, 0);

	/* cov.f32[us]32 q, q */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = src_type;
	vectorize(ctx, instr, &q_dst, 1, q_src, 0);

	/* 32-bit r = q * b via the same mull.u/madsh.m16 sequence that
	 * trans_umul uses:
	 */
	/* integer multiply q by b */
	/* mull.u r, q, b */
	instr = instr_create(ctx, 2, OPC_MULL_U);
	vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);

	/* madsh.m16 r, q, b, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);

	/* madsh.m16 r, b, q, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);

	/* sub.u r, a, r */
	instr = instr_create(ctx, 2, OPC_SUB_U);
	vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);

	/* refine: add floor(r / b) back into the quotient, computed in
	 * float with the already-biased reciprocal:
	 */
	/* cov.u32f32, r, r */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = get_utype(ctx);
	instr->cat1.dst_type = get_ftype(ctx);
	vectorize(ctx, instr, &r_dst, 1, r_src, 0);

	/* mul.f r, r, bf */
	instr = instr_create(ctx, 2, OPC_MUL_F);
	vectorize(ctx, instr, &r_dst, 2, r_src, 0, bf_src, 0);

	/* cov.f32u32 r, r */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = get_utype(ctx);
	vectorize(ctx, instr, &r_dst, 1, r_src, 0);

	/* add.u q, q, r */
	instr = instr_create(ctx, 2, OPC_ADD_U);
	vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);

	/* final fixup: recompute r = a - q*b and bump q if r >= b */
	/* mull.u r, q, b */
	instr = instr_create(ctx, 2, OPC_MULL_U);
	vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);

	/* madsh.m16 r, q, b, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);

	/* madsh.m16 r, b, q, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);

	/* sub.u r, a, r */
	instr = instr_create(ctx, 2, OPC_SUB_U);
	vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);

	/* cmps.u.ge r, r, b -- r is now 0 or 1 (the quotient correction) */
	instr = instr_create(ctx, 2, OPC_CMPS_U);
	instr->cat2.condition = IR3_COND_GE;
	vectorize(ctx, instr, &r_dst, 2, r_src, 0, b_src, 0);

	if (type_uint(src_type)) {
		/* add.u dst, q, r */
		instr = instr_create(ctx, 2, OPC_ADD_U);
		vectorize(ctx, instr, premod_dst, 2, q_src, 0, r_src, 0);
	} else {
		/* add.u q, q, r */
		instr = instr_create(ctx, 2, OPC_ADD_U);
		vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);

		/* negate result based on the original arguments */
		if (is_const(a) && is_const(b))
			a = get_unconst(ctx, a);

		/* xor.b r, numerator, denominator */
		instr = instr_create(ctx, 2, OPC_XOR_B);
		vectorize(ctx, instr, &r_dst, 2, a, 0, b, 0);

		/* shr.b r, r, 31 -- r.x = 1 iff the signs differ */
		instr = instr_create(ctx, 2, OPC_SHR_B);
		vectorize(ctx, instr, &r_dst, 2, r_src, 0, &thirty_one, 0);

		/* absneg.s b, (neg)q */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, &b_dst, 1, q_src, IR3_REG_NEGATE);

		/* sel.b dst, b, r, q -- pick -q or q depending on sign bit */
		instr = instr_create(ctx, 3, OPC_SEL_B32);
		vectorize(ctx, instr, premod_dst, 3, b_src, 0, r_src, 0, q_src, 0);
	}

	if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD) {
		/* The division result will have ended up in q. */

		if (is_rel_or_const(b))
			b = get_unconst(ctx, b);

		/* mull.u r, q, b */
		instr = instr_create(ctx, 2, OPC_MULL_U);
		vectorize(ctx, instr, &r_dst, 2, q_src, 0, b, 0);

		/* madsh.m16 r, q, b, r */
		instr = instr_create(ctx, 3, OPC_MADSH_M16);
		vectorize(ctx, instr, &r_dst, 3, q_src, 0, b, 0, r_src, 0);

		/* madsh.m16 r, b, q, r */
		instr = instr_create(ctx, 3, OPC_MADSH_M16);
		vectorize(ctx, instr, &r_dst, 3, b, 0, q_src, 0, r_src, 0);

		/* sub.u dst, a, r */
		instr = instr_create(ctx, 2, OPC_SUB_U);
		vectorize(ctx, instr, dst, 2, a, 0, r_src, 0);
	}

	put_dst(ctx, inst, dst);
}
2375
2376/*
2377 * Handlers for TGSI instructions which do have 1:1 mapping to native
2378 * instructions:
2379 */
2380
/* category 0 (flow control) instructions with no operands, e.g. END */
static void
instr_cat0(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	instr_create(ctx, 0, t->opc);
}
2388
2389static void
2390instr_cat1(const struct instr_translater *t,
2391		struct ir3_compile_context *ctx,
2392		struct tgsi_full_instruction *inst)
2393{
2394	struct tgsi_dst_register *dst = get_dst(ctx, inst);
2395	struct tgsi_src_register *src = &inst->Src[0].Register;
2396	create_mov(ctx, dst, src);
2397	put_dst(ctx, inst, dst);
2398}
2399
2400static void
2401instr_cat2(const struct instr_translater *t,
2402		struct ir3_compile_context *ctx,
2403		struct tgsi_full_instruction *inst)
2404{
2405	struct tgsi_dst_register *dst = get_dst(ctx, inst);
2406	struct tgsi_src_register *src0 = &inst->Src[0].Register;
2407	struct tgsi_src_register *src1 = &inst->Src[1].Register;
2408	struct ir3_instruction *instr;
2409	unsigned src0_flags = 0, src1_flags = 0;
2410
2411	switch (t->tgsi_opc) {
2412	case TGSI_OPCODE_ABS:
2413	case TGSI_OPCODE_IABS:
2414		src0_flags = IR3_REG_ABS;
2415		break;
2416	case TGSI_OPCODE_INEG:
2417		src0_flags = IR3_REG_NEGATE;
2418		break;
2419	case TGSI_OPCODE_SUB:
2420		src1_flags = IR3_REG_NEGATE;
2421		break;
2422	}
2423
2424	switch (t->opc) {
2425	case OPC_ABSNEG_F:
2426	case OPC_ABSNEG_S:
2427	case OPC_CLZ_B:
2428	case OPC_CLZ_S:
2429	case OPC_SIGN_F:
2430	case OPC_FLOOR_F:
2431	case OPC_CEIL_F:
2432	case OPC_RNDNE_F:
2433	case OPC_RNDAZ_F:
2434	case OPC_TRUNC_F:
2435	case OPC_NOT_B:
2436	case OPC_BFREV_B:
2437	case OPC_SETRM:
2438	case OPC_CBITS_B:
2439		/* these only have one src reg */
2440		instr = instr_create(ctx, 2, t->opc);
2441		vectorize(ctx, instr, dst, 1, src0, src0_flags);
2442		break;
2443	default:
2444		if (is_const(src0) && is_const(src1))
2445			src0 = get_unconst(ctx, src0);
2446
2447		instr = instr_create(ctx, 2, t->opc);
2448		vectorize(ctx, instr, dst, 2, src0, src0_flags,
2449				src1, src1_flags);
2450		break;
2451	}
2452
2453	put_dst(ctx, inst, dst);
2454}
2455
2456static void
2457instr_cat3(const struct instr_translater *t,
2458		struct ir3_compile_context *ctx,
2459		struct tgsi_full_instruction *inst)
2460{
2461	struct tgsi_dst_register *dst = get_dst(ctx, inst);
2462	struct tgsi_src_register *src0 = &inst->Src[0].Register;
2463	struct tgsi_src_register *src1 = &inst->Src[1].Register;
2464	struct ir3_instruction *instr;
2465
2466	/* in particular, can't handle const for src1 for cat3..
2467	 * for mad, we can swap first two src's if needed:
2468	 */
2469	if (is_rel_or_const(src1)) {
2470		if (is_mad(t->opc) && !is_rel_or_const(src0)) {
2471			struct tgsi_src_register *tmp;
2472			tmp = src0;
2473			src0 = src1;
2474			src1 = tmp;
2475		} else {
2476			src1 = get_unconst(ctx, src1);
2477		}
2478	}
2479
2480	instr = instr_create(ctx, 3, t->opc);
2481	vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
2482			&inst->Src[2].Register, 0);
2483	put_dst(ctx, inst, dst);
2484}
2485
2486static void
2487instr_cat4(const struct instr_translater *t,
2488		struct ir3_compile_context *ctx,
2489		struct tgsi_full_instruction *inst)
2490{
2491	struct tgsi_dst_register *dst = get_dst(ctx, inst);
2492	struct tgsi_src_register *src = &inst->Src[0].Register;
2493	struct ir3_instruction *instr;
2494	unsigned i;
2495
2496	/* seems like blob compiler avoids const as src.. */
2497	if (is_const(src))
2498		src = get_unconst(ctx, src);
2499
2500	/* we need to replicate into each component: */
2501	for (i = 0; i < 4; i++) {
2502		if (dst->WriteMask & (1 << i)) {
2503			instr = instr_create(ctx, 4, t->opc);
2504			add_dst_reg(ctx, instr, dst, i);
2505			add_src_reg(ctx, instr, src, src->SwizzleX);
2506		}
2507	}
2508
2509	put_dst(ctx, inst, dst);
2510}
2511
2512static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
2513#define INSTR(n, f, ...) \
2514	[TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
2515
2516	INSTR(MOV,          instr_cat1),
2517	INSTR(RCP,          instr_cat4, .opc = OPC_RCP),
2518	INSTR(RSQ,          instr_cat4, .opc = OPC_RSQ),
2519	INSTR(SQRT,         instr_cat4, .opc = OPC_SQRT),
2520	INSTR(MUL,          instr_cat2, .opc = OPC_MUL_F),
2521	INSTR(ADD,          instr_cat2, .opc = OPC_ADD_F),
2522	INSTR(SUB,          instr_cat2, .opc = OPC_ADD_F),
2523	INSTR(MIN,          instr_cat2, .opc = OPC_MIN_F),
2524	INSTR(MAX,          instr_cat2, .opc = OPC_MAX_F),
2525	INSTR(UADD,         instr_cat2, .opc = OPC_ADD_U),
2526	INSTR(IMIN,         instr_cat2, .opc = OPC_MIN_S),
2527	INSTR(UMIN,         instr_cat2, .opc = OPC_MIN_U),
2528	INSTR(IMAX,         instr_cat2, .opc = OPC_MAX_S),
2529	INSTR(UMAX,         instr_cat2, .opc = OPC_MAX_U),
2530	INSTR(AND,          instr_cat2, .opc = OPC_AND_B),
2531	INSTR(OR,           instr_cat2, .opc = OPC_OR_B),
2532	INSTR(NOT,          instr_cat2, .opc = OPC_NOT_B),
2533	INSTR(XOR,          instr_cat2, .opc = OPC_XOR_B),
2534	INSTR(UMUL,         trans_umul),
2535	INSTR(UMAD,         trans_umul),
2536	INSTR(UDIV,         trans_idiv),
2537	INSTR(IDIV,         trans_idiv),
2538	INSTR(MOD,          trans_idiv),
2539	INSTR(UMOD,         trans_idiv),
2540	INSTR(SHL,          instr_cat2, .opc = OPC_SHL_B),
2541	INSTR(USHR,         instr_cat2, .opc = OPC_SHR_B),
2542	INSTR(ISHR,         instr_cat2, .opc = OPC_ASHR_B),
2543	INSTR(IABS,         instr_cat2, .opc = OPC_ABSNEG_S),
2544	INSTR(INEG,         instr_cat2, .opc = OPC_ABSNEG_S),
2545	INSTR(AND,          instr_cat2, .opc = OPC_AND_B),
2546	INSTR(MAD,          instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
2547	INSTR(TRUNC,        instr_cat2, .opc = OPC_TRUNC_F),
2548	INSTR(CLAMP,        trans_clamp),
2549	INSTR(FLR,          instr_cat2, .opc = OPC_FLOOR_F),
2550	INSTR(ROUND,        instr_cat2, .opc = OPC_RNDNE_F),
2551	INSTR(SSG,          instr_cat2, .opc = OPC_SIGN_F),
2552	INSTR(CEIL,         instr_cat2, .opc = OPC_CEIL_F),
2553	INSTR(ARL,          trans_arl),
2554	INSTR(UARL,         trans_arl),
2555	INSTR(EX2,          instr_cat4, .opc = OPC_EXP2),
2556	INSTR(LG2,          instr_cat4, .opc = OPC_LOG2),
2557	INSTR(ABS,          instr_cat2, .opc = OPC_ABSNEG_F),
2558	INSTR(COS,          instr_cat4, .opc = OPC_COS),
2559	INSTR(SIN,          instr_cat4, .opc = OPC_SIN),
2560	INSTR(TEX,          trans_samp, .opc = OPC_SAM),
2561	INSTR(TXP,          trans_samp, .opc = OPC_SAM),
2562	INSTR(TXB,          trans_samp, .opc = OPC_SAMB),
2563	INSTR(TXB2,         trans_samp, .opc = OPC_SAMB),
2564	INSTR(TXL,          trans_samp, .opc = OPC_SAML),
2565	INSTR(TXD,          trans_samp, .opc = OPC_SAMGQ),
2566	INSTR(TXF,          trans_samp, .opc = OPC_ISAML),
2567	INSTR(TXQ,          trans_txq),
2568	INSTR(DDX,          trans_deriv, .opc = OPC_DSX),
2569	INSTR(DDY,          trans_deriv, .opc = OPC_DSY),
2570	INSTR(SGT,          trans_cmp),
2571	INSTR(SLT,          trans_cmp),
2572	INSTR(FSLT,         trans_cmp),
2573	INSTR(SGE,          trans_cmp),
2574	INSTR(FSGE,         trans_cmp),
2575	INSTR(SLE,          trans_cmp),
2576	INSTR(SNE,          trans_cmp),
2577	INSTR(FSNE,         trans_cmp),
2578	INSTR(SEQ,          trans_cmp),
2579	INSTR(FSEQ,         trans_cmp),
2580	INSTR(CMP,          trans_cmp),
2581	INSTR(USNE,         trans_icmp, .opc = OPC_CMPS_U),
2582	INSTR(USEQ,         trans_icmp, .opc = OPC_CMPS_U),
2583	INSTR(ISGE,         trans_icmp, .opc = OPC_CMPS_S),
2584	INSTR(USGE,         trans_icmp, .opc = OPC_CMPS_U),
2585	INSTR(ISLT,         trans_icmp, .opc = OPC_CMPS_S),
2586	INSTR(USLT,         trans_icmp, .opc = OPC_CMPS_U),
2587	INSTR(UCMP,         trans_ucmp),
2588	INSTR(ISSG,         trans_issg),
2589	INSTR(IF,           trans_if,   .opc = OPC_CMPS_F),
2590	INSTR(UIF,          trans_if,   .opc = OPC_CMPS_U),
2591	INSTR(ELSE,         trans_else),
2592	INSTR(ENDIF,        trans_endif),
2593	INSTR(END,          instr_cat0, .opc = OPC_END),
2594	INSTR(KILL,         trans_kill, .opc = OPC_KILL),
2595	INSTR(KILL_IF,      trans_killif, .opc = OPC_KILL),
2596	INSTR(I2F,          trans_cov),
2597	INSTR(U2F,          trans_cov),
2598	INSTR(F2I,          trans_cov),
2599	INSTR(F2U,          trans_cov),
2600};
2601
/* pack a TGSI semantic name+index pair into ir3's compact semantic value */
static ir3_semantic
decl_semantic(const struct tgsi_declaration_semantic *sem)
{
	return ir3_semantic_name(sem->Name, sem->Index);
}
2607
2608static struct ir3_instruction *
2609decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
2610		unsigned j, unsigned inloc)
2611{
2612	struct ir3_instruction *instr;
2613	struct ir3_register *src;
2614
2615	/* bary.f dst, #inloc, r0.x */
2616	instr = instr_create(ctx, 2, OPC_BARY_F);
2617	ir3_reg_create(instr, regid, 0);   /* dummy dst */
2618	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
2619	src = ir3_reg_create(instr, 0, IR3_REG_SSA);
2620	src->wrmask = 0x3;
2621	src->instr = ctx->frag_pos;
2622
2623	return instr;
2624}
2625
2626/* TGSI_SEMANTIC_POSITION
2627 * """"""""""""""""""""""
2628 *
2629 * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
2630 * fragment shader input contains the fragment's window position.  The X
2631 * component starts at zero and always increases from left to right.
2632 * The Y component starts at zero and always increases but Y=0 may either
2633 * indicate the top of the window or the bottom depending on the fragment
2634 * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
2635 * The Z coordinate ranges from 0 to 1 to represent depth from the front
 * to the back of the Z buffer.  The W component contains the reciprocal
2637 * of the interpolated vertex position W component.
2638 */
/* Emit the fixup sequence for one component (j) of the fragment
 * window-position input, returning the instruction producing the final
 * float value.  Also records the raw hw input in ctx->frag_coord[j] so
 * fixup_frag_inputs() can assign its register later.
 */
static struct ir3_instruction *
decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
		unsigned j)
{
	struct ir3_instruction *instr, *src;

	compile_assert(ctx, !ctx->frag_coord[j]);

	ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);


	switch (j) {
	case 0: /* .x */
	case 1: /* .y */
		/* for frag_coord, we get unsigned values.. we need
		 * to subtract (integer) 8 and divide by 16 (right-
		 * shift by 4) then convert to float:
		 */
		/* (i.e. the raw value appears to be fixed-point with 4
		 * fractional bits and a half-pixel bias -- TODO confirm
		 * against hw docs)
		 */

		/* add.s tmp, src, -8 */
		instr = instr_create(ctx, 2, OPC_ADD_S);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
		src = instr;

		/* shr.b tmp, tmp, 4 */
		instr = instr_create(ctx, 2, OPC_SHR_B);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
		src = instr;

		/* mov.u32f32 dst, tmp */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = TYPE_U32;
		instr->cat1.dst_type = TYPE_F32;
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;

		break;
	case 2: /* .z */
	case 3: /* .w */
		/* seems that we can use these as-is: */
		instr = ctx->frag_coord[j];
		break;
	default:
		compile_error(ctx, "invalid channel\n");
		instr = create_immed(ctx, 0.0);
		break;
	}

	return instr;
}
2693
2694/* TGSI_SEMANTIC_FACE
2695 * """"""""""""""""""
2696 *
2697 * This label applies to fragment shader inputs only and indicates that
2698 * the register contains front/back-face information of the form (F, 0,
2699 * 0, 1).  The first component will be positive when the fragment belongs
2700 * to a front-facing polygon, and negative when the fragment belongs to a
2701 * back-facing polygon.
2702 */
/* Emit the fixup sequence for one component (j) of the fragment
 * front/back-face input, mapping the hw's -1/0 integer onto the
 * TGSI-expected (F, 0, 0, 1) vector with F being -1.0 or 1.0.
 */
static struct ir3_instruction *
decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
		unsigned j)
{
	struct ir3_instruction *instr, *src;

	switch (j) {
	case 0: /* .x */
		compile_assert(ctx, !ctx->frag_face);

		ctx->frag_face = create_input(ctx->block, NULL, 0);

		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
		 * positive vs negative float.. and piglit further seems to
		 * expect -1.0 or 1.0:
		 *
		 *    mul.s tmp, hr0.x, 2
		 *    add.s tmp, tmp, 1
		 *    mov.s16f32, dst, tmp
		 *
		 */
		/* (-1 -> -1, 0 -> 1 after the mul/add) */

		instr = instr_create(ctx, 2, OPC_MUL_S);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
		src = instr;

		instr = instr_create(ctx, 2, OPC_ADD_S);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
		src = instr;

		instr = instr_create(ctx, 1, 0); /* mov */
		instr->cat1.src_type = TYPE_S32;
		instr->cat1.dst_type = TYPE_F32;
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;

		break;
	case 1: /* .y */
	case 2: /* .z */
		instr = create_immed(ctx, 0.0);
		break;
	case 3: /* .w */
		instr = create_immed(ctx, 1.0);
		break;
	default:
		compile_error(ctx, "invalid channel\n");
		instr = create_immed(ctx, 0.0);
		break;
	}

	return instr;
}
2759
/* Handle a TGSI input declaration: record the input in the shader
 * variant and populate block->inputs with either real hw inputs (VS) or
 * the appropriate bary.f / frag_coord / frag_face fixup instructions (FS).
 */
static void
decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned name = decl->Semantic.Name;
	unsigned i;

	/* I don't think we should get frag shader input without
	 * semantic info?  Otherwise how do inputs get linked to
	 * vert outputs?
	 */
	compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
			decl->Declaration.Semantic);

	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
		unsigned n = so->inputs_count++;
		unsigned r = regid(i, 0);
		unsigned ncomp, j;

		/* we'll figure out the actual components used after scheduling */
		ncomp = 4;

		DBG("decl in -> r%d", i);

		compile_assert(ctx, n < ARRAY_SIZE(so->inputs));

		so->inputs[n].semantic = decl_semantic(&decl->Semantic);
		so->inputs[n].compmask = (1 << ncomp) - 1;
		so->inputs[n].regid = r;
		so->inputs[n].inloc = ctx->next_inloc;
		so->inputs[n].interpolate = decl->Interp.Interpolate;

		for (j = 0; j < ncomp; j++) {
			struct ir3_instruction *instr = NULL;

			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				/* for fragment shaders, POSITION and FACE are handled
				 * specially, not using normal varying / bary.f
				 */
				if (name == TGSI_SEMANTIC_POSITION) {
					so->inputs[n].bary = false;
					so->frag_coord = true;
					instr = decl_in_frag_coord(ctx, r + j, j);
				} else if (name == TGSI_SEMANTIC_FACE) {
					so->inputs[n].bary = false;
					so->frag_face = true;
					instr = decl_in_frag_face(ctx, r + j, j);
				} else {
					so->inputs[n].bary = true;
					/* the "- 8" looks like an inloc base offset --
					 * presumably accounting for system inlocs; TODO
					 * confirm where next_inloc starts
					 */
					instr = decl_in_frag_bary(ctx, r + j, j,
							so->inputs[n].inloc + j - 8);
				}
			} else {
				instr = create_input(ctx->block, NULL, (i * 4) + j);
			}

			ctx->block->inputs[(i * 4) + j] = instr;
		}

		/* frag_coord/frag_face inputs don't consume varying slots: */
		if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
			ctx->next_inloc += ncomp;
			so->total_in += ncomp;
		}
	}
}
2825
/* Handle a TGSI output declaration: validate the semantic for this shader
 * stage, record the output in the shader variant, and pre-seed the block's
 * output slots with 0.0 immediates so unwritten components are defined.
 */
static void
decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned comp = 0;
	unsigned name = decl->Semantic.Name;
	unsigned i;

	compile_assert(ctx, decl->Declaration.Semantic);

	DBG("decl out[%d] -> r%d", name, decl->Range.First);

	if (ctx->type == TGSI_PROCESSOR_VERTEX) {
		switch (name) {
		case TGSI_SEMANTIC_POSITION:
			so->writes_pos = true;
			break;
		case TGSI_SEMANTIC_PSIZE:
			so->writes_psize = true;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_GENERIC:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
			break;
		default:
			compile_error(ctx, "unknown VS semantic name: %s\n",
					tgsi_semantic_names[name]);
		}
	} else {
		switch (name) {
		case TGSI_SEMANTIC_POSITION:
			comp = 2;  /* tgsi will write to .z component */
			so->writes_pos = true;
			break;
		case TGSI_SEMANTIC_COLOR:
			break;
		default:
			compile_error(ctx, "unknown FS semantic name: %s\n",
					tgsi_semantic_names[name]);
		}
	}

	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
		unsigned n = so->outputs_count++;
		unsigned ncomp, j;

		ncomp = 4;

		compile_assert(ctx, n < ARRAY_SIZE(so->outputs));

		so->outputs[n].semantic = decl_semantic(&decl->Semantic);
		so->outputs[n].regid = regid(i, comp);

		/* avoid undefined outputs, stick a dummy mov from imm{0.0},
		 * which if the output is actually assigned will be over-
		 * written
		 */
		for (j = 0; j < ncomp; j++)
			ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
	}
}
2889
2890/* from TGSI perspective, we actually have inputs.  But most of the "inputs"
2891 * for a fragment shader are just bary.f instructions.  The *actual* inputs
2892 * from the hw perspective are the frag_pos and optionally frag_coord and
2893 * frag_face.
2894 */
static void
fixup_frag_inputs(struct ir3_compile_context *ctx)
{
	struct ir3_shader_variant *so = ctx->so;
	struct ir3_block *block = ctx->block;
	struct ir3_instruction **inputs;
	struct ir3_instruction *instr;
	int n, regid = 0;

	/* rebuild block->inputs from scratch with the hw's view: */
	block->ninputs = 0;

	n  = 4;  /* always have frag_pos */
	n += COND(so->frag_face, 4);
	n += COND(so->frag_coord, 4);

	inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));

	if (so->frag_face) {
		/* this ultimately gets assigned to hr0.x so doesn't conflict
		 * with frag_coord/frag_pos..
		 */
		inputs[block->ninputs++] = ctx->frag_face;
		ctx->frag_face->regs[0]->num = 0;

		/* remaining channels not used, but let's avoid confusing
		 * other parts that expect inputs to come in groups of vec4
		 */
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
	}

	/* since we don't know where to set the regid for frag_coord,
	 * we have to use r0.x for it.  But we don't want to *always*
	 * use r1.x for frag_pos as that could increase the register
	 * footprint on simple shaders:
	 */
	if (so->frag_coord) {
		ctx->frag_coord[0]->regs[0]->num = regid++;
		ctx->frag_coord[1]->regs[0]->num = regid++;
		ctx->frag_coord[2]->regs[0]->num = regid++;
		ctx->frag_coord[3]->regs[0]->num = regid++;

		inputs[block->ninputs++] = ctx->frag_coord[0];
		inputs[block->ninputs++] = ctx->frag_coord[1];
		inputs[block->ninputs++] = ctx->frag_coord[2];
		inputs[block->ninputs++] = ctx->frag_coord[3];
	}

	/* we always have frag_pos: */
	so->pos_regid = regid;

	/* create the two scalar inputs feeding the frag_pos meta-instr */
	/* r0.x */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[1]->instr = instr;

	/* r0.y */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[2]->instr = instr;

	block->inputs = inputs;
}
2961
/* main translation loop: walk the TGSI token stream and dispatch
 * declarations, immediates, and instructions to their handlers:
 */
static void
compile_instructions(struct ir3_compile_context *ctx)
{
	push_block(ctx);

	/* for fragment shader, we have a single input register (usually
	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
	 */
	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
		struct ir3_instruction *instr;
		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
		ctx->frag_pos = instr;
	}

	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
		tgsi_parse_token(&ctx->parser);

		switch (ctx->parser.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_DECLARATION: {
			struct tgsi_full_declaration *decl =
					&ctx->parser.FullToken.FullDeclaration;
			if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
				decl_out(ctx, decl);
			} else if (decl->Declaration.File == TGSI_FILE_INPUT) {
				decl_in(ctx, decl);
			}
			break;
		}
		case TGSI_TOKEN_TYPE_IMMEDIATE: {
			/* TODO: if we know the immediate is small enough, and only
			 * used with instructions that can embed an immediate, we
			 * can skip this:
			 */
			struct tgsi_full_immediate *imm =
					&ctx->parser.FullToken.FullImmediate;
			unsigned n = ctx->so->immediates_count++;
			compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
			memcpy(ctx->so->immediates[n].val, imm->u, 16);
			break;
		}
		case TGSI_TOKEN_TYPE_INSTRUCTION: {
			struct tgsi_full_instruction *inst =
					&ctx->parser.FullToken.FullInstruction;
			unsigned opc = inst->Instruction.Opcode;
			const struct instr_translater *t = &translaters[opc];

			if (t->fxn) {
				t->fxn(t, ctx, inst);
				/* internal temps are only valid within one instruction: */
				ctx->num_internal_temps = 0;

				compile_assert(ctx, !ctx->using_tmp_dst);
			} else {
				compile_error(ctx, "unknown TGSI opc: %s\n",
						tgsi_get_opcode_name(opc));
			}

			/* apply TGSI saturate modifiers as a trailing clamp: */
			switch (inst->Instruction.Saturate) {
			case TGSI_SAT_ZERO_ONE:
				create_clamp_imm(ctx, &inst->Dst[0].Register,
						fui(0.0), fui(1.0));
				break;
			case TGSI_SAT_MINUS_PLUS_ONE:
				create_clamp_imm(ctx, &inst->Dst[0].Register,
						fui(-1.0), fui(1.0));
				break;
			}

			instr_finish(ctx);

			break;
		}
		default:
			break;
		}
	}
}
3041
3042static void
3043compile_dump(struct ir3_compile_context *ctx)
3044{
3045	const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
3046	static unsigned n = 0;
3047	char fname[16];
3048	FILE *f;
3049	snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
3050	f = fopen(fname, "w");
3051	if (!f)
3052		return;
3053	ir3_block_depth(ctx->block);
3054	ir3_dump(ctx->ir, name, ctx->block, f);
3055	fclose(f);
3056}
3057
3058int
3059ir3_compile_shader(struct ir3_shader_variant *so,
3060		const struct tgsi_token *tokens, struct ir3_shader_key key,
3061		bool cp)
3062{
3063	struct ir3_compile_context ctx;
3064	struct ir3_block *block;
3065	struct ir3_instruction **inputs;
3066	unsigned i, j, actual_in;
3067	int ret = 0, max_bary;
3068
3069	assert(!so->ir);
3070
3071	so->ir = ir3_create();
3072
3073	assert(so->ir);
3074
3075	if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
3076		DBG("INIT failed!");
3077		ret = -1;
3078		goto out;
3079	}
3080
3081	compile_instructions(&ctx);
3082
3083	block = ctx.block;
3084	so->ir->block = block;
3085
3086	/* keep track of the inputs from TGSI perspective.. */
3087	inputs = block->inputs;
3088
3089	/* but fixup actual inputs for frag shader: */
3090	if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
3091		fixup_frag_inputs(&ctx);
3092
3093	/* at this point, for binning pass, throw away unneeded outputs: */
3094	if (key.binning_pass) {
3095		for (i = 0, j = 0; i < so->outputs_count; i++) {
3096			unsigned name = sem2name(so->outputs[i].semantic);
3097			unsigned idx = sem2name(so->outputs[i].semantic);
3098
3099			/* throw away everything but first position/psize */
3100			if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
3101					(name == TGSI_SEMANTIC_PSIZE))) {
3102				if (i != j) {
3103					so->outputs[j] = so->outputs[i];
3104					block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
3105					block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
3106					block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
3107					block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
3108				}
3109				j++;
3110			}
3111		}
3112		so->outputs_count = j;
3113		block->noutputs = j * 4;
3114	}
3115
3116	/* for rendering to alpha format, we only need the .w component,
3117	 * and we need it to be in the .x position:
3118	 */
3119	if (key.alpha) {
3120		for (i = 0, j = 0; i < so->outputs_count; i++) {
3121			unsigned name = sem2name(so->outputs[i].semantic);
3122
3123			/* move .w component to .x and discard others: */
3124			if (name == TGSI_SEMANTIC_COLOR) {
3125				block->outputs[(i*4)+0] = block->outputs[(i*4)+3];
3126				block->outputs[(i*4)+1] = NULL;
3127				block->outputs[(i*4)+2] = NULL;
3128				block->outputs[(i*4)+3] = NULL;
3129			}
3130		}
3131	}
3132
3133	/* at this point, we want the kill's in the outputs array too,
3134	 * so that they get scheduled (since they have no dst).. we've
3135	 * already ensured that the array is big enough in push_block():
3136	 */
3137	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
3138		for (i = 0; i < ctx.kill_count; i++)
3139			block->outputs[block->noutputs++] = ctx.kill[i];
3140	}
3141
3142	if (fd_mesa_debug & FD_DBG_OPTDUMP)
3143		compile_dump(&ctx);
3144
3145	ret = ir3_block_flatten(block);
3146	if (ret < 0) {
3147		DBG("FLATTEN failed!");
3148		goto out;
3149	}
3150	if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
3151		compile_dump(&ctx);
3152
3153	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3154		printf("BEFORE CP:\n");
3155		ir3_dump_instr_list(block->head);
3156	}
3157
3158	if (cp && !(fd_mesa_debug & FD_DBG_NOCP))
3159		ir3_block_cp(block);
3160
3161	if (fd_mesa_debug & FD_DBG_OPTDUMP)
3162		compile_dump(&ctx);
3163
3164	ir3_block_depth(block);
3165
3166	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3167		printf("AFTER DEPTH:\n");
3168		ir3_dump_instr_list(block->head);
3169	}
3170
3171	ret = ir3_block_sched(block);
3172	if (ret) {
3173		DBG("SCHED failed!");
3174		goto out;
3175	}
3176
3177	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3178		printf("AFTER SCHED:\n");
3179		ir3_dump_instr_list(block->head);
3180	}
3181
3182	ret = ir3_block_ra(block, so->type, key.half_precision,
3183			so->frag_coord, so->frag_face, &so->has_samp, &max_bary);
3184	if (ret) {
3185		DBG("RA failed!");
3186		goto out;
3187	}
3188
3189	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3190		printf("AFTER RA:\n");
3191		ir3_dump_instr_list(block->head);
3192	}
3193
3194	/* fixup input/outputs: */
3195	for (i = 0; i < so->outputs_count; i++) {
3196		so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
3197		/* preserve hack for depth output.. tgsi writes depth to .z,
3198		 * but what we give the hw is the scalar register:
3199		 */
3200		if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
3201			(sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
3202			so->outputs[i].regid += 2;
3203	}
3204	/* Note that some or all channels of an input may be unused: */
3205	actual_in = 0;
3206	for (i = 0; i < so->inputs_count; i++) {
3207		unsigned j, regid = ~0, compmask = 0;
3208		so->inputs[i].ncomp = 0;
3209		for (j = 0; j < 4; j++) {
3210			struct ir3_instruction *in = inputs[(i*4) + j];
3211			if (in) {
3212				compmask |= (1 << j);
3213				regid = in->regs[0]->num - j;
3214				actual_in++;
3215				so->inputs[i].ncomp++;
3216			}
3217		}
3218		so->inputs[i].regid = regid;
3219		so->inputs[i].compmask = compmask;
3220	}
3221
3222	/* fragment shader always gets full vec4's even if it doesn't
3223	 * fetch all components, but vertex shader we need to update
3224	 * with the actual number of components fetch, otherwise thing
3225	 * will hang due to mismaptch between VFD_DECODE's and
3226	 * TOTALATTRTOVS
3227	 */
3228	if (so->type == SHADER_VERTEX)
3229		so->total_in = actual_in;
3230	else
3231		so->total_in = align(max_bary + 1, 4);
3232
3233out:
3234	if (ret) {
3235		ir3_destroy(so->ir);
3236		so->ir = NULL;
3237	}
3238	compile_free(&ctx);
3239
3240	return ret;
3241}
3242