/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
2
3/*
4 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 *
25 * Authors:
26 *    Rob Clark <robclark@freedesktop.org>
27 */
28
29#include <stdarg.h>
30
31#include "pipe/p_state.h"
32#include "util/u_string.h"
33#include "util/u_memory.h"
34#include "util/u_inlines.h"
35
36#include "freedreno_util.h"
37
38#include "ir3_compiler.h"
39#include "ir3_shader.h"
40#include "ir3_nir.h"
41
42#include "instr-a3xx.h"
43#include "ir3.h"
44
45
46struct ir3_compile {
47	struct ir3_compiler *compiler;
48
49	struct nir_shader *s;
50
51	struct ir3 *ir;
52	struct ir3_shader_variant *so;
53
54	struct ir3_block *block;      /* the current block */
55	struct ir3_block *in_block;   /* block created for shader inputs */
56
57	nir_function_impl *impl;
58
	/* For fragment shaders, from the hw perspective the only
	 * actual input is the r0.xy position register passed to bary.f.
	 * But NIR doesn't know that; it still declares things as
	 * IN[] registers.  So we do all the input tracking normally
	 * and fix things up after compile_instructions()
	 *
	 * NOTE that frag_pos is the hardware position (possibly it
	 * is actually an index or tag or some such.. it is *not* a
	 * value that can be directly used for gl_FragCoord..)
	 */
69	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];
70
71	/* For vertex shaders, keep track of the system values sources */
72	struct ir3_instruction *vertex_id, *basevertex, *instance_id;
73
74	/* mapping from nir_register to defining instruction: */
75	struct hash_table *def_ht;
76
77	unsigned num_arrays;
78
	/* a common pattern for indirect addressing is to request the
	 * same address register multiple times.  To avoid generating
	 * duplicate instruction sequences (which our backend does not
	 * try to clean up, since that should be done at the NIR stage)
	 * we cache the address value generated for a given src value:
	 */
85	struct hash_table *addr_ht;
86
	/* maps nir_block to ir3_block, mostly for the purposes of
	 * figuring out the block's successors
	 */
90	struct hash_table *block_ht;
91
	/* a4xx (at least patchlevel 0) does not seem to be able to
	 * flat-interpolate, so we need to use ldlv.u32 to load the
	 * varying directly:
	 */
95	bool flat_bypass;
96
97	/* on a3xx, we need to add one to # of array levels:
98	 */
99	bool levels_add_one;
100
101	/* on a3xx, we need to scale up integer coords for isaml based
102	 * on LoD:
103	 */
104	bool unminify_coords;
105
106	/* on a4xx, for array textures we need to add 0.5 to the array
107	 * index coordinate:
108	 */
109	bool array_index_add_half;
110
111	/* on a4xx, bitmask of samplers which need astc+srgb workaround: */
112	unsigned astc_srgb;
113
114	unsigned max_texture_index;
115
	/* set if we encounter something we can't handle yet, so we
	 * can bail cleanly and fall back to the TGSI compiler f/e
	 */
119	bool error;
120};
121
122/* gpu pointer size in units of 32bit registers/slots */
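/* (a5xx and later use 64bit addresses, so pointers stored in const
 * space, such as UBO and stream-out buffer addresses, take two slots)
 */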
123static unsigned pointer_size(struct ir3_compile *ctx)
124{
125	return (ctx->compiler->gpu_id >= 500) ? 2 : 1;
126}
127
128static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
129static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
130
131
132static struct ir3_compile *
133compile_init(struct ir3_compiler *compiler,
134		struct ir3_shader_variant *so)
135{
136	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
137
138	if (compiler->gpu_id >= 400) {
139		/* need special handling for "flat" */
140		ctx->flat_bypass = true;
141		ctx->levels_add_one = false;
142		ctx->unminify_coords = false;
143		ctx->array_index_add_half = true;
144
145		if (so->type == SHADER_VERTEX)
146			ctx->astc_srgb = so->key.vastc_srgb;
147		else if (so->type == SHADER_FRAGMENT)
148			ctx->astc_srgb = so->key.fastc_srgb;
149
150	} else {
151		/* no special handling for "flat" */
152		ctx->flat_bypass = false;
153		ctx->levels_add_one = true;
154		ctx->unminify_coords = true;
155		ctx->array_index_add_half = false;
156	}
157
158	ctx->compiler = compiler;
159	ctx->ir = so->ir;
160	ctx->so = so;
161	ctx->def_ht = _mesa_hash_table_create(ctx,
162			_mesa_hash_pointer, _mesa_key_pointer_equal);
163	ctx->block_ht = _mesa_hash_table_create(ctx,
164			_mesa_hash_pointer, _mesa_key_pointer_equal);
165
	/* TODO: maybe generate some sort of bitmask of what the key
	 * lowers vs what the shader has (ie. no need to do texture
	 * clamp lowering if there are no texture sample instrs)..
	 * although this should be done further up the stack to avoid
	 * creating duplicate variants..
	 */
172
173	if (ir3_key_lowers_nir(&so->key)) {
174		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
175		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
176	} else {
177		/* fast-path for shader key that lowers nothing in NIR: */
178		ctx->s = so->shader->nir;
179	}
180
181	if (fd_mesa_debug & FD_DBG_DISASM) {
182		DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}",
183			so->shader->id, so->id, so->type,
184			so->key.binning_pass, so->key.color_two_side,
185			so->key.half_precision);
186		nir_print_shader(ctx->s, stdout);
187	}
188
189	so->num_uniforms = ctx->s->num_uniforms;
190	so->num_ubos = ctx->s->info->num_ubos;
191
192	/* Layout of constant registers, each section aligned to vec4.  Note
193	 * that pointer size (ubo, etc) changes depending on generation.
194	 *
195	 *    user consts
196	 *    UBO addresses
197	 *    if (vertex shader) {
198	 *        driver params (IR3_DP_*)
199	 *        if (stream_output.num_outputs > 0)
200	 *           stream-out addresses
201	 *    }
202	 *    immediates
203	 *
204	 * Immediates go last mostly because they are inserted in the CP pass
205	 * after the nir -> ir3 frontend.
206	 */
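	/* NOTE: constoff and the constbase.* values below are in units of
	 * vec4 slots.  The 'align(x, 4) / 4' idiom converts a count of
	 * 32bit slots (pointers, driver params) into whole vec4s, while
	 * num_uniforms appears to already be a vec4 count (hence no /4):
	 */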
207	unsigned constoff = align(ctx->s->num_uniforms, 4);
208	unsigned ptrsz = pointer_size(ctx);
209
210	memset(&so->constbase, ~0, sizeof(so->constbase));
211
212	if (so->num_ubos > 0) {
213		so->constbase.ubo = constoff;
214		constoff += align(ctx->s->info->num_ubos * ptrsz, 4) / 4;
215	}
216
217	if (so->type == SHADER_VERTEX) {
218		so->constbase.driver_param = constoff;
219		constoff += align(IR3_DP_COUNT, 4) / 4;
220
221		if ((compiler->gpu_id < 500) &&
222				so->shader->stream_output.num_outputs > 0) {
223			so->constbase.tfbo = constoff;
224			constoff += align(PIPE_MAX_SO_BUFFERS * ptrsz, 4) / 4;
225		}
226	}
227
228	so->constbase.immediate = constoff;
229
230	return ctx;
231}
232
233static void
234compile_error(struct ir3_compile *ctx, const char *format, ...)
235{
236	va_list ap;
237	va_start(ap, format);
238	_debug_vprintf(format, ap);
239	va_end(ap);
240	nir_print_shader(ctx->s, stdout);
241	ctx->error = true;
242	debug_assert(0);
243}
244
245#define compile_assert(ctx, cond) do { \
246		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
247	} while (0)
248
249static void
250compile_free(struct ir3_compile *ctx)
251{
252	ralloc_free(ctx);
253}
254
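/* NIR local array variables get tracked as ir3_array's (addressed
 * relative to a0.x).  The length is in scalar components, ie. the
 * number of (vec4) elements times four:
 */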
255static void
256declare_var(struct ir3_compile *ctx, nir_variable *var)
257{
258	unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
259	struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
260	arr->id = ++ctx->num_arrays;
261	arr->length = length;
262	arr->var = var;
263	list_addtail(&arr->node, &ctx->ir->array_list);
264}
265
266static struct ir3_array *
267get_var(struct ir3_compile *ctx, nir_variable *var)
268{
269	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
270		if (arr->var == var)
271			return arr;
272	}
273	compile_error(ctx, "bogus var: %s\n", var->name);
274	return NULL;
275}
276
/* allocate an n element value array (to be populated by the caller)
 * and insert it in def_ht
 */
280static struct ir3_instruction **
281__get_dst(struct ir3_compile *ctx, void *key, unsigned n)
282{
283	struct ir3_instruction **value =
284		ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
285	_mesa_hash_table_insert(ctx->def_ht, key, value);
286	return value;
287}
288
289static struct ir3_instruction **
290get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n)
291{
292	compile_assert(ctx, dst->is_ssa);
293	if (dst->is_ssa) {
294		return __get_dst(ctx, &dst->ssa, n);
295	} else {
296		return __get_dst(ctx, dst->reg.reg, n);
297	}
298}
299
300static struct ir3_instruction **
301get_dst_ssa(struct ir3_compile *ctx, nir_ssa_def *dst, unsigned n)
302{
303	return __get_dst(ctx, dst, n);
304}
305
306static struct ir3_instruction * const *
307get_src(struct ir3_compile *ctx, nir_src *src)
308{
309	struct hash_entry *entry;
310	compile_assert(ctx, src->is_ssa);
311	if (src->is_ssa) {
312		entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
313	} else {
314		entry = _mesa_hash_table_search(ctx->def_ht, src->reg.reg);
315	}
316	compile_assert(ctx, entry);
317	return entry->data;
318}
319
320static struct ir3_instruction *
321create_immed(struct ir3_block *block, uint32_t val)
322{
323	struct ir3_instruction *mov;
324
325	mov = ir3_instr_create(block, OPC_MOV);
326	mov->cat1.src_type = TYPE_U32;
327	mov->cat1.dst_type = TYPE_U32;
328	ir3_reg_create(mov, 0, 0);
329	ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
330
331	return mov;
332}
333
334static struct ir3_instruction *
335create_addr(struct ir3_block *block, struct ir3_instruction *src)
336{
337	struct ir3_instruction *instr, *immed;
338
339	/* TODO in at least some cases, the backend could probably be
340	 * made clever enough to propagate IR3_REG_HALF..
341	 */
342	instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
343	instr->regs[0]->flags |= IR3_REG_HALF;
344
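	/* the address register indexes individual scalar slots, while the
	 * incoming NIR-level offset appears to be in vec4 units, hence the
	 * shift-left by two below:
	 */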
345	immed = create_immed(block, 2);
346	immed->regs[0]->flags |= IR3_REG_HALF;
347
348	instr = ir3_SHL_B(block, instr, 0, immed, 0);
349	instr->regs[0]->flags |= IR3_REG_HALF;
350	instr->regs[1]->flags |= IR3_REG_HALF;
351
352	instr = ir3_MOV(block, instr, TYPE_S16);
353	instr->regs[0]->num = regid(REG_A0, 0);
354	instr->regs[0]->flags |= IR3_REG_HALF;
355	instr->regs[1]->flags |= IR3_REG_HALF;
356
357	return instr;
358}
359
/* caches addr values to avoid generating multiple cov/shl/mova
 * sequences for each use of a given NIR-level src as an address
 */
363static struct ir3_instruction *
364get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
365{
366	struct ir3_instruction *addr;
367
368	if (!ctx->addr_ht) {
369		ctx->addr_ht = _mesa_hash_table_create(ctx,
370				_mesa_hash_pointer, _mesa_key_pointer_equal);
371	} else {
372		struct hash_entry *entry;
373		entry = _mesa_hash_table_search(ctx->addr_ht, src);
374		if (entry)
375			return entry->data;
376	}
377
378	addr = create_addr(ctx->block, src);
379	_mesa_hash_table_insert(ctx->addr_ht, src, addr);
380
381	return addr;
382}
383
384static struct ir3_instruction *
385get_predicate(struct ir3_compile *ctx, struct ir3_instruction *src)
386{
387	struct ir3_block *b = ctx->block;
388	struct ir3_instruction *cond;
389
390	/* NOTE: only cmps.*.* can write p0.x: */
391	cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
392	cond->cat2.condition = IR3_COND_NE;
393
394	/* condition always goes in predicate register: */
395	cond->regs[0]->num = regid(REG_P0, 0);
396
397	return cond;
398}
399
400static struct ir3_instruction *
401create_uniform(struct ir3_compile *ctx, unsigned n)
402{
403	struct ir3_instruction *mov;
404
405	mov = ir3_instr_create(ctx->block, OPC_MOV);
406	/* TODO get types right? */
407	mov->cat1.src_type = TYPE_F32;
408	mov->cat1.dst_type = TYPE_F32;
409	ir3_reg_create(mov, 0, 0);
410	ir3_reg_create(mov, n, IR3_REG_CONST);
411
412	return mov;
413}
414
415static struct ir3_instruction *
416create_uniform_indirect(struct ir3_compile *ctx, int n,
417		struct ir3_instruction *address)
418{
419	struct ir3_instruction *mov;
420
421	mov = ir3_instr_create(ctx->block, OPC_MOV);
422	mov->cat1.src_type = TYPE_U32;
423	mov->cat1.dst_type = TYPE_U32;
424	ir3_reg_create(mov, 0, 0);
425	ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
426
427	ir3_instr_set_address(mov, address);
428
429	return mov;
430}
431
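/* Gather 'arrsz' scalar ssa values into a single virtual vector value
 * (meta:fi, ie. "fan-in"), so that they end up in consecutive registers
 * and can be consumed by instructions expecting a multi-component src:
 */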
432static struct ir3_instruction *
433create_collect(struct ir3_block *block, struct ir3_instruction **arr,
434		unsigned arrsz)
435{
436	struct ir3_instruction *collect;
437
438	if (arrsz == 0)
439		return NULL;
440
441	collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
442	ir3_reg_create(collect, 0, 0);     /* dst */
443	for (unsigned i = 0; i < arrsz; i++)
444		ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i];
445
446	return collect;
447}
448
449static struct ir3_instruction *
450create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n,
451		struct ir3_instruction *address, struct ir3_instruction *collect)
452{
453	struct ir3_block *block = ctx->block;
454	struct ir3_instruction *mov;
455	struct ir3_register *src;
456
457	mov = ir3_instr_create(block, OPC_MOV);
458	mov->cat1.src_type = TYPE_U32;
459	mov->cat1.dst_type = TYPE_U32;
460	ir3_reg_create(mov, 0, 0);
461	src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
462	src->instr = collect;
463	src->size  = arrsz;
464	src->array.offset = n;
465
466	ir3_instr_set_address(mov, address);
467
468	return mov;
469}
470
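/* The load/store helpers below link each array access to the previous
 * one (last_write / last_access), which is presumably what lets later
 * stages preserve read-after-write and write-after-read ordering for
 * a0.x-relative array accesses:
 */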
471/* relative (indirect) if address!=NULL */
472static struct ir3_instruction *
473create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n,
474		struct ir3_instruction *address)
475{
476	struct ir3_block *block = ctx->block;
477	struct ir3_instruction *mov;
478	struct ir3_register *src;
479
480	mov = ir3_instr_create(block, OPC_MOV);
481	mov->cat1.src_type = TYPE_U32;
482	mov->cat1.dst_type = TYPE_U32;
483	ir3_reg_create(mov, 0, 0);
484	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
485			COND(address, IR3_REG_RELATIV));
486	src->instr = arr->last_write;
487	src->size  = arr->length;
488	src->array.id = arr->id;
489	src->array.offset = n;
490
491	if (address)
492		ir3_instr_set_address(mov, address);
493
494	arr->last_access = mov;
495
496	return mov;
497}
498
499/* relative (indirect) if address!=NULL */
500static struct ir3_instruction *
501create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
502		struct ir3_instruction *src, struct ir3_instruction *address)
503{
504	struct ir3_block *block = ctx->block;
505	struct ir3_instruction *mov;
506	struct ir3_register *dst;
507
508	mov = ir3_instr_create(block, OPC_MOV);
509	mov->cat1.src_type = TYPE_U32;
510	mov->cat1.dst_type = TYPE_U32;
511	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
512			COND(address, IR3_REG_RELATIV));
513	dst->instr = arr->last_access;
514	dst->size  = arr->length;
515	dst->array.id = arr->id;
516	dst->array.offset = n;
517	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
518
519	ir3_instr_set_address(mov, address);
520
521	arr->last_write = arr->last_access = mov;
522
523	return mov;
524}
525
526static struct ir3_instruction *
527create_input(struct ir3_block *block, unsigned n)
528{
529	struct ir3_instruction *in;
530
531	in = ir3_instr_create(block, OPC_META_INPUT);
532	in->inout.block = block;
533	ir3_reg_create(in, n, 0);
534
535	return in;
536}
537
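/* Fragment shader varying load: with flat_bypass (a4xx), flat-shaded
 * varyings are loaded directly with ldlv; otherwise bary.f interpolates
 * the varying using the position (frag_pos) passed in r0.xy:
 */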
538static struct ir3_instruction *
539create_frag_input(struct ir3_compile *ctx, bool use_ldlv)
540{
541	struct ir3_block *block = ctx->block;
542	struct ir3_instruction *instr;
543	/* actual inloc is assigned and fixed up later: */
544	struct ir3_instruction *inloc = create_immed(block, 0);
545
546	if (use_ldlv) {
547		instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
548		instr->cat6.type = TYPE_U32;
549		instr->cat6.iim_val = 1;
550	} else {
551		instr = ir3_BARY_F(block, inloc, 0, ctx->frag_pos, 0);
552		instr->regs[2]->wrmask = 0x3;
553	}
554
555	return instr;
556}
557
558static struct ir3_instruction *
559create_frag_coord(struct ir3_compile *ctx, unsigned comp)
560{
561	struct ir3_block *block = ctx->block;
562	struct ir3_instruction *instr;
563
564	compile_assert(ctx, !ctx->frag_coord[comp]);
565
566	ctx->frag_coord[comp] = create_input(ctx->block, 0);
567
568	switch (comp) {
569	case 0: /* .x */
570	case 1: /* .y */
571		/* for frag_coord, we get unsigned values.. we need
572		 * to subtract (integer) 8 and divide by 16 (right-
573		 * shift by 4) then convert to float:
574		 *
575		 *    sub.s tmp, src, 8
576		 *    shr.b tmp, tmp, 4
577		 *    mov.u32f32 dst, tmp
578		 *
579		 */
580		instr = ir3_SUB_S(block, ctx->frag_coord[comp], 0,
581				create_immed(block, 8), 0);
582		instr = ir3_SHR_B(block, instr, 0,
583				create_immed(block, 4), 0);
584		instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32);
585
586		return instr;
587	case 2: /* .z */
588	case 3: /* .w */
589	default:
590		/* seems that we can use these as-is: */
591		return ctx->frag_coord[comp];
592	}
593}
594
595static struct ir3_instruction *
596create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp)
597{
	/* first four vec4 sysvals reserved for UBOs: */
599	/* NOTE: dp is in scalar, but there can be >4 dp components: */
600	unsigned n = ctx->so->constbase.driver_param;
601	unsigned r = regid(n + dp / 4, dp % 4);
602	return create_uniform(ctx, r);
603}
604
605/* helper for instructions that produce multiple consecutive scalar
606 * outputs which need to have a split/fanout meta instruction inserted
607 */
608static void
609split_dest(struct ir3_block *block, struct ir3_instruction **dst,
610		struct ir3_instruction *src, unsigned base, unsigned n)
611{
612	struct ir3_instruction *prev = NULL;
613	for (int i = 0, j = 0; i < n; i++) {
614		struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
615		ir3_reg_create(split, 0, IR3_REG_SSA);
616		ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
617		split->fo.off = i + base;
618
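		/* link the fanout instructions together in a left/right chain,
		 * so later passes can see the full group of neighbors:
		 */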
619		if (prev) {
620			split->cp.left = prev;
621			split->cp.left_cnt++;
622			prev->cp.right = split;
623			prev->cp.right_cnt++;
624		}
625		prev = split;
626
627		if (src->regs[0]->wrmask & (1 << (i + base)))
628			dst[j++] = split;
629	}
630}
631
/*
 * Adreno uses uint rather than having a dedicated bool type,
 * which (potentially) requires some conversion, in particular
 * when using the output of a bool instr as an int input, or
 * vice versa.
 *
 *         | Adreno  |  NIR  |
 *  -------+---------+-------+-
 *   true  |    1    |  ~0   |
 *   false |    0    |   0   |
 *
 * To convert from an adreno bool (uint) to nir, use:
 *
 *    absneg.s dst, (neg)src
 *
 * To convert back in the other direction:
 *
 *    absneg.s dst, (abs)src
 *
 * The CP step can clean up absneg.s pairs that cancel each other
 * out, and with a slight bit of extra cleverness (to recognize
 * the instructions which produce either a 0 or 1) can eliminate
 * the absneg.s's completely when an instruction that wants
 * 0/1 consumes the result.  For example, when a nir 'bcsel'
 * consumes the result of 'feq'.  So we should be able to get by
 * without a boolean resolve step, and without incurring any
 * extra penalty in instruction count.
 */
660
661/* NIR bool -> native (adreno): */
662static struct ir3_instruction *
663ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
664{
665	return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
666}
667
668/* native (adreno) -> NIR bool: */
669static struct ir3_instruction *
670ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
671{
672	return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
673}
674
675/*
676 * alu/sfu instructions:
677 */
678
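/* NOTE: by this point the NIR alu ops are (expected to be) scalarized,
 * so other than the vecN special case below each op produces a single
 * scalar result per enabled dest channel:
 */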
679static void
680emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
681{
682	const nir_op_info *info = &nir_op_infos[alu->op];
683	struct ir3_instruction **dst, *src[info->num_inputs];
684	struct ir3_block *b = ctx->block;
685
686	dst = get_dst(ctx, &alu->dest.dest, MAX2(info->output_size, 1));
687
	/* Vectors are special in that they have non-scalarized writemasks:
	 * the first swizzle channel of each argument is taken, in order,
	 * into the corresponding writemask channel.
	 */
692	if ((alu->op == nir_op_vec2) ||
693			(alu->op == nir_op_vec3) ||
694			(alu->op == nir_op_vec4)) {
695
696		for (int i = 0; i < info->num_inputs; i++) {
697			nir_alu_src *asrc = &alu->src[i];
698
699			compile_assert(ctx, !asrc->abs);
700			compile_assert(ctx, !asrc->negate);
701
702			src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]];
703			if (!src[i])
704				src[i] = create_immed(ctx->block, 0);
705			dst[i] = ir3_MOV(b, src[i], TYPE_U32);
706		}
707
708		return;
709	}
710
711	/* General case: We can just grab the one used channel per src. */
712	for (int i = 0; i < info->num_inputs; i++) {
713		unsigned chan = ffs(alu->dest.write_mask) - 1;
714		nir_alu_src *asrc = &alu->src[i];
715
716		compile_assert(ctx, !asrc->abs);
717		compile_assert(ctx, !asrc->negate);
718
719		src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
720
721		compile_assert(ctx, src[i]);
722	}
723
724	switch (alu->op) {
725	case nir_op_f2i:
726		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_S32);
727		break;
728	case nir_op_f2u:
729		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_U32);
730		break;
731	case nir_op_i2f:
732		dst[0] = ir3_COV(b, src[0], TYPE_S32, TYPE_F32);
733		break;
734	case nir_op_u2f:
735		dst[0] = ir3_COV(b, src[0], TYPE_U32, TYPE_F32);
736		break;
737	case nir_op_imov:
738		dst[0] = ir3_MOV(b, src[0], TYPE_S32);
739		break;
740	case nir_op_fmov:
741		dst[0] = ir3_MOV(b, src[0], TYPE_F32);
742		break;
743	case nir_op_f2b:
744		dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
745		dst[0]->cat2.condition = IR3_COND_NE;
746		dst[0] = ir3_n2b(b, dst[0]);
747		break;
748	case nir_op_b2f:
749		dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
750		break;
751	case nir_op_b2i:
752		dst[0] = ir3_b2n(b, src[0]);
753		break;
754	case nir_op_i2b:
755		dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
756		dst[0]->cat2.condition = IR3_COND_NE;
757		dst[0] = ir3_n2b(b, dst[0]);
758		break;
759
760	case nir_op_fneg:
761		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
762		break;
763	case nir_op_fabs:
764		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
765		break;
766	case nir_op_fmax:
767		dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
768		break;
769	case nir_op_fmin:
770		dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
771		break;
772	case nir_op_fmul:
773		dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
774		break;
775	case nir_op_fadd:
776		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
777		break;
778	case nir_op_fsub:
779		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
780		break;
781	case nir_op_ffma:
782		dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
783		break;
784	case nir_op_fddx:
785		dst[0] = ir3_DSX(b, src[0], 0);
786		dst[0]->cat5.type = TYPE_F32;
787		break;
788	case nir_op_fddy:
789		dst[0] = ir3_DSY(b, src[0], 0);
790		dst[0]->cat5.type = TYPE_F32;
		break;
793	case nir_op_flt:
794		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
795		dst[0]->cat2.condition = IR3_COND_LT;
796		dst[0] = ir3_n2b(b, dst[0]);
797		break;
798	case nir_op_fge:
799		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
800		dst[0]->cat2.condition = IR3_COND_GE;
801		dst[0] = ir3_n2b(b, dst[0]);
802		break;
803	case nir_op_feq:
804		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
805		dst[0]->cat2.condition = IR3_COND_EQ;
806		dst[0] = ir3_n2b(b, dst[0]);
807		break;
808	case nir_op_fne:
809		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
810		dst[0]->cat2.condition = IR3_COND_NE;
811		dst[0] = ir3_n2b(b, dst[0]);
812		break;
813	case nir_op_fceil:
814		dst[0] = ir3_CEIL_F(b, src[0], 0);
815		break;
816	case nir_op_ffloor:
817		dst[0] = ir3_FLOOR_F(b, src[0], 0);
818		break;
819	case nir_op_ftrunc:
820		dst[0] = ir3_TRUNC_F(b, src[0], 0);
821		break;
822	case nir_op_fround_even:
823		dst[0] = ir3_RNDNE_F(b, src[0], 0);
824		break;
825	case nir_op_fsign:
826		dst[0] = ir3_SIGN_F(b, src[0], 0);
827		break;
828
829	case nir_op_fsin:
830		dst[0] = ir3_SIN(b, src[0], 0);
831		break;
832	case nir_op_fcos:
833		dst[0] = ir3_COS(b, src[0], 0);
834		break;
835	case nir_op_frsq:
836		dst[0] = ir3_RSQ(b, src[0], 0);
837		break;
838	case nir_op_frcp:
839		dst[0] = ir3_RCP(b, src[0], 0);
840		break;
841	case nir_op_flog2:
842		dst[0] = ir3_LOG2(b, src[0], 0);
843		break;
844	case nir_op_fexp2:
845		dst[0] = ir3_EXP2(b, src[0], 0);
846		break;
847	case nir_op_fsqrt:
848		dst[0] = ir3_SQRT(b, src[0], 0);
849		break;
850
851	case nir_op_iabs:
852		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
853		break;
854	case nir_op_iadd:
855		dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
856		break;
857	case nir_op_iand:
858		dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
859		break;
860	case nir_op_imax:
861		dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
862		break;
863	case nir_op_umax:
864		dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
865		break;
866	case nir_op_imin:
867		dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
868		break;
869	case nir_op_umin:
870		dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
871		break;
872	case nir_op_imul:
873		/*
874		 * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
875		 *   mull.u tmp0, a, b           ; mul low, i.e. al * bl
876		 *   madsh.m16 tmp1, a, b, tmp0  ; mul-add shift high mix, i.e. ah * bl << 16
877		 *   madsh.m16 dst, b, a, tmp1   ; i.e. al * bh << 16
878		 */
879		dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
880					ir3_MADSH_M16(b, src[0], 0, src[1], 0,
881						ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
882		break;
883	case nir_op_ineg:
884		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
885		break;
886	case nir_op_inot:
887		dst[0] = ir3_NOT_B(b, src[0], 0);
888		break;
889	case nir_op_ior:
890		dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
891		break;
892	case nir_op_ishl:
893		dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0);
894		break;
895	case nir_op_ishr:
896		dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0);
897		break;
898	case nir_op_isign: {
899		/* maybe this would be sane to lower in nir.. */
900		struct ir3_instruction *neg, *pos;
901
902		neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
903		neg->cat2.condition = IR3_COND_LT;
904
905		pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
906		pos->cat2.condition = IR3_COND_GT;
907
908		dst[0] = ir3_SUB_U(b, pos, 0, neg, 0);
909
910		break;
911	}
912	case nir_op_isub:
913		dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
914		break;
915	case nir_op_ixor:
916		dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
917		break;
918	case nir_op_ushr:
919		dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0);
920		break;
921	case nir_op_ilt:
922		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
923		dst[0]->cat2.condition = IR3_COND_LT;
924		dst[0] = ir3_n2b(b, dst[0]);
925		break;
926	case nir_op_ige:
927		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
928		dst[0]->cat2.condition = IR3_COND_GE;
929		dst[0] = ir3_n2b(b, dst[0]);
930		break;
931	case nir_op_ieq:
932		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
933		dst[0]->cat2.condition = IR3_COND_EQ;
934		dst[0] = ir3_n2b(b, dst[0]);
935		break;
936	case nir_op_ine:
937		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
938		dst[0]->cat2.condition = IR3_COND_NE;
939		dst[0] = ir3_n2b(b, dst[0]);
940		break;
941	case nir_op_ult:
942		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
943		dst[0]->cat2.condition = IR3_COND_LT;
944		dst[0] = ir3_n2b(b, dst[0]);
945		break;
946	case nir_op_uge:
947		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
948		dst[0]->cat2.condition = IR3_COND_GE;
949		dst[0] = ir3_n2b(b, dst[0]);
950		break;
951
952	case nir_op_bcsel:
953		dst[0] = ir3_SEL_B32(b, src[1], 0, ir3_b2n(b, src[0]), 0, src[2], 0);
954		break;
955
956	case nir_op_bit_count:
957		dst[0] = ir3_CBITS_B(b, src[0], 0);
958		break;
959	case nir_op_ifind_msb: {
960		struct ir3_instruction *cmp;
961		dst[0] = ir3_CLZ_S(b, src[0], 0);
962		cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
963		cmp->cat2.condition = IR3_COND_GE;
964		dst[0] = ir3_SEL_B32(b,
965				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
966				cmp, 0, dst[0], 0);
967		break;
968	}
969	case nir_op_ufind_msb:
970		dst[0] = ir3_CLZ_B(b, src[0], 0);
971		dst[0] = ir3_SEL_B32(b,
972				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
973				src[0], 0, dst[0], 0);
974		break;
975	case nir_op_find_lsb:
976		dst[0] = ir3_BFREV_B(b, src[0], 0);
977		dst[0] = ir3_CLZ_B(b, dst[0], 0);
978		break;
979	case nir_op_bitfield_reverse:
980		dst[0] = ir3_BFREV_B(b, src[0], 0);
981		break;
982
983	default:
984		compile_error(ctx, "Unhandled ALU op: %s\n",
985				nir_op_infos[alu->op].name);
986		break;
987	}
988}
989
990/* handles direct/indirect UBO reads: */
991static void
992emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
993		struct ir3_instruction **dst)
994{
995	struct ir3_block *b = ctx->block;
996	struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
997	nir_const_value *const_offset;
998	/* UBO addresses are the first driver params: */
999	unsigned ubo = regid(ctx->so->constbase.ubo, 0);
1000	const unsigned ptrsz = pointer_size(ctx);
1001
1002	int off = 0;
1003
1004	/* First src is ubo index, which could either be an immed or not: */
1005	src0 = get_src(ctx, &intr->src[0])[0];
1006	if (is_same_type_mov(src0) &&
1007			(src0->regs[1]->flags & IR3_REG_IMMED)) {
1008		base_lo = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz));
1009		base_hi = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
1010	} else {
1011		base_lo = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0));
1012		base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0));
1013	}
1014
	/* note: on 32bit gpus base_hi is ignored and DCE'd */
1016	addr = base_lo;
1017
1018	const_offset = nir_src_as_const_value(intr->src[1]);
1019	if (const_offset) {
1020		off += const_offset->u32[0];
1021	} else {
1022		/* For load_ubo_indirect, second src is indirect offset: */
1023		src1 = get_src(ctx, &intr->src[1])[0];
1024
1025		/* and add offset to addr: */
1026		addr = ir3_ADD_S(b, addr, 0, src1, 0);
1027	}
1028
	/* if offset is too large to encode in the ldg, split it out: */
1030	if ((off + (intr->num_components * 4)) > 1024) {
1031		/* split out the minimal amount to improve the odds that
1032		 * cp can fit the immediate in the add.s instruction:
1033		 */
1034		unsigned off2 = off + (intr->num_components * 4) - 1024;
1035		addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
1036		off -= off2;
1037	}
1038
1039	if (ptrsz == 2) {
1040		struct ir3_instruction *carry;
1041
1042		/* handle 32b rollover, ie:
1043		 *   if (addr < base_lo)
1044		 *      base_hi++
1045		 */
1046		carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
1047		carry->cat2.condition = IR3_COND_LT;
1048		base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
1049
1050		addr = create_collect(b, (struct ir3_instruction*[]){ addr, base_hi }, 2);
1051	}
1052
1053	for (int i = 0; i < intr->num_components; i++) {
1054		struct ir3_instruction *load =
1055				ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
1056		load->cat6.type = TYPE_U32;
1057		load->cat6.src_offset = off + i * 4;     /* byte offset */
1058		dst[i] = load;
1059	}
1060}
1061
1062/* handles array reads: */
1063static void
1064emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
1065		struct ir3_instruction **dst)
1066{
1067	nir_deref_var *dvar = intr->variables[0];
1068	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
1069	struct ir3_array *arr = get_var(ctx, dvar->var);
1070
1071	compile_assert(ctx, dvar->deref.child &&
1072		(dvar->deref.child->deref_type == nir_deref_type_array));
1073
1074	switch (darr->deref_array_type) {
1075	case nir_deref_array_type_direct:
1076		/* direct access does not require anything special: */
1077		for (int i = 0; i < intr->num_components; i++) {
1078			unsigned n = darr->base_offset * 4 + i;
1079			compile_assert(ctx, n < arr->length);
1080			dst[i] = create_var_load(ctx, arr, n, NULL);
1081		}
1082		break;
1083	case nir_deref_array_type_indirect: {
1084		/* for indirect, we need to collect all the array elements: */
1085		struct ir3_instruction *addr =
1086				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
1087		for (int i = 0; i < intr->num_components; i++) {
1088			unsigned n = darr->base_offset * 4 + i;
1089			compile_assert(ctx, n < arr->length);
1090			dst[i] = create_var_load(ctx, arr, n, addr);
1091		}
1092		break;
1093	}
1094	default:
1095		compile_error(ctx, "Unhandled load deref type: %u\n",
1096				darr->deref_array_type);
1097		break;
1098	}
1099}
1100
1101/* handles array writes: */
1102static void
1103emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
1104{
1105	nir_deref_var *dvar = intr->variables[0];
1106	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
1107	struct ir3_array *arr = get_var(ctx, dvar->var);
1108	struct ir3_instruction *addr;
1109	struct ir3_instruction * const *src;
1110	unsigned wrmask = nir_intrinsic_write_mask(intr);
1111
1112	compile_assert(ctx, dvar->deref.child &&
1113		(dvar->deref.child->deref_type == nir_deref_type_array));
1114
1115	src = get_src(ctx, &intr->src[0]);
1116
1117	switch (darr->deref_array_type) {
1118	case nir_deref_array_type_direct:
1119		addr = NULL;
1120		break;
1121	case nir_deref_array_type_indirect:
1122		addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
1123		break;
1124	default:
1125		compile_error(ctx, "Unhandled store deref type: %u\n",
1126				darr->deref_array_type);
1127		return;
1128	}
1129
1130	for (int i = 0; i < intr->num_components; i++) {
1131		if (!(wrmask & (1 << i)))
1132			continue;
1133		unsigned n = darr->base_offset * 4 + i;
1134		compile_assert(ctx, n < arr->length);
1135		create_var_store(ctx, arr, n, src[i], addr);
1136	}
1137}
1138
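/* Append a new shader input slot for a system value (single component,
 * flat) and record the instruction that produces it:
 */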
1139static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
1140		struct ir3_instruction *instr)
1141{
1142	struct ir3_shader_variant *so = ctx->so;
1143	unsigned r = regid(so->inputs_count, 0);
1144	unsigned n = so->inputs_count++;
1145
1146	so->inputs[n].sysval = true;
1147	so->inputs[n].slot = slot;
1148	so->inputs[n].compmask = 1;
1149	so->inputs[n].regid = r;
1150	so->inputs[n].interpolate = INTERP_MODE_FLAT;
1151	so->total_in++;
1152
1153	ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
1154	ctx->ir->inputs[r] = instr;
1155}
1156
1157static void
1158emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
1159{
1160	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
1161	struct ir3_instruction **dst;
1162	struct ir3_instruction * const *src;
1163	struct ir3_block *b = ctx->block;
1164	nir_const_value *const_offset;
1165	int idx;
1166
1167	if (info->has_dest) {
1168		dst = get_dst(ctx, &intr->dest, intr->num_components);
1169	} else {
1170		dst = NULL;
1171	}
1172
1173	switch (intr->intrinsic) {
1174	case nir_intrinsic_load_uniform:
1175		idx = nir_intrinsic_base(intr);
1176		const_offset = nir_src_as_const_value(intr->src[0]);
1177		if (const_offset) {
1178			idx += const_offset->u32[0];
1179			for (int i = 0; i < intr->num_components; i++) {
1180				unsigned n = idx * 4 + i;
1181				dst[i] = create_uniform(ctx, n);
1182			}
1183		} else {
1184			src = get_src(ctx, &intr->src[0]);
1185			for (int i = 0; i < intr->num_components; i++) {
1186				int n = idx * 4 + i;
1187				dst[i] = create_uniform_indirect(ctx, n,
1188						get_addr(ctx, src[0]));
1189			}
1190			/* NOTE: if relative addressing is used, we set
1191			 * constlen in the compiler (to worst-case value)
1192			 * since we don't know in the assembler what the max
1193			 * addr reg value can be:
1194			 */
1195			ctx->so->constlen = ctx->s->num_uniforms;
1196		}
1197		break;
1198	case nir_intrinsic_load_ubo:
1199		emit_intrinsic_load_ubo(ctx, intr, dst);
1200		break;
1201	case nir_intrinsic_load_input:
1202		idx = nir_intrinsic_base(intr);
1203		const_offset = nir_src_as_const_value(intr->src[0]);
1204		if (const_offset) {
1205			idx += const_offset->u32[0];
1206			for (int i = 0; i < intr->num_components; i++) {
1207				unsigned n = idx * 4 + i;
1208				dst[i] = ctx->ir->inputs[n];
1209			}
1210		} else {
1211			src = get_src(ctx, &intr->src[0]);
1212			struct ir3_instruction *collect =
1213					create_collect(b, ctx->ir->inputs, ctx->ir->ninputs);
1214			struct ir3_instruction *addr = get_addr(ctx, src[0]);
1215			for (int i = 0; i < intr->num_components; i++) {
1216				unsigned n = idx * 4 + i;
1217				dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
1218						n, addr, collect);
1219			}
1220		}
1221		break;
1222	case nir_intrinsic_load_var:
1223		emit_intrinsic_load_var(ctx, intr, dst);
1224		break;
1225	case nir_intrinsic_store_var:
1226		emit_intrinsic_store_var(ctx, intr);
1227		break;
1228	case nir_intrinsic_store_output:
1229		idx = nir_intrinsic_base(intr);
1230		const_offset = nir_src_as_const_value(intr->src[1]);
1231		compile_assert(ctx, const_offset != NULL);
1232		idx += const_offset->u32[0];
1233
1234		src = get_src(ctx, &intr->src[0]);
1235		for (int i = 0; i < intr->num_components; i++) {
1236			unsigned n = idx * 4 + i;
1237			ctx->ir->outputs[n] = src[i];
1238		}
1239		break;
1240	case nir_intrinsic_load_base_vertex:
1241		if (!ctx->basevertex) {
1242			ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
1243			add_sysval_input(ctx, SYSTEM_VALUE_BASE_VERTEX,
1244					ctx->basevertex);
1245		}
1246		dst[0] = ctx->basevertex;
1247		break;
1248	case nir_intrinsic_load_vertex_id_zero_base:
1249	case nir_intrinsic_load_vertex_id:
1250		if (!ctx->vertex_id) {
1251			gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ?
1252				SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
1253			ctx->vertex_id = create_input(b, 0);
1254			add_sysval_input(ctx, sv, ctx->vertex_id);
1255		}
1256		dst[0] = ctx->vertex_id;
1257		break;
1258	case nir_intrinsic_load_instance_id:
1259		if (!ctx->instance_id) {
1260			ctx->instance_id = create_input(b, 0);
1261			add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
1262					ctx->instance_id);
1263		}
1264		dst[0] = ctx->instance_id;
1265		break;
1266	case nir_intrinsic_load_user_clip_plane:
1267		idx = nir_intrinsic_ucp_id(intr);
1268		for (int i = 0; i < intr->num_components; i++) {
1269			unsigned n = idx * 4 + i;
1270			dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
1271		}
1272		break;
1273	case nir_intrinsic_load_front_face:
1274		if (!ctx->frag_face) {
1275			ctx->so->frag_face = true;
1276			ctx->frag_face = create_input(b, 0);
1277			ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
1278		}
1279		/* for fragface, we always get -1 or 0, but that is inverse
1280		 * of what nir expects (where ~0 is true).  Unfortunately
1281		 * trying to widen from half to full in add.s seems to do a
1282		 * non-sign-extending widen (resulting in something that
1283		 * gets interpreted as float Inf??)
1284		 */
1285		dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
1286		dst[0] = ir3_ADD_S(b, dst[0], 0, create_immed(b, 1), 0);
1287		break;
1288	case nir_intrinsic_discard_if:
1289	case nir_intrinsic_discard: {
1290		struct ir3_instruction *cond, *kill;
1291
1292		if (intr->intrinsic == nir_intrinsic_discard_if) {
1293			/* conditional discard: */
1294			src = get_src(ctx, &intr->src[0]);
1295			cond = ir3_b2n(b, src[0]);
1296		} else {
1297			/* unconditional discard: */
1298			cond = create_immed(b, 1);
1299		}
1300
1301		/* NOTE: only cmps.*.* can write p0.x: */
1302		cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
1303		cond->cat2.condition = IR3_COND_NE;
1304
1305		/* condition always goes in predicate register: */
1306		cond->regs[0]->num = regid(REG_P0, 0);
1307
1308		kill = ir3_KILL(b, cond, 0);
1309		array_insert(ctx->ir->predicates, kill);
1310
1311		array_insert(ctx->ir->keeps, kill);
1312		ctx->so->has_kill = true;
1313
1314		break;
1315	}
1316	default:
1317		compile_error(ctx, "Unhandled intrinsic type: %s\n",
1318				nir_intrinsic_infos[intr->intrinsic].name);
1319		break;
1320	}
1321}
1322
1323static void
1324emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr)
1325{
1326	struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
1327			instr->def.num_components);
1328	for (int i = 0; i < instr->def.num_components; i++)
1329		dst[i] = create_immed(ctx->block, instr->value.u32[i]);
1330}
1331
1332static void
1333emit_undef(struct ir3_compile *ctx, nir_ssa_undef_instr *undef)
1334{
1335	struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def,
1336			undef->def.num_components);
1337	/* backend doesn't want undefined instructions, so just plug
1338	 * in 0.0..
1339	 */
1340	for (int i = 0; i < undef->def.num_components; i++)
1341		dst[i] = create_immed(ctx->block, fui(0.0));
1342}
1343
1344/*
1345 * texture fetch/sample instructions:
1346 */
1347
1348static void
1349tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
1350{
1351	unsigned coords, flags = 0;
1352
1353	/* note: would use tex->coord_components.. except txs.. also,
1354	 * since array index goes after shadow ref, we don't want to
1355	 * count it:
1356	 */
1357	switch (tex->sampler_dim) {
1358	case GLSL_SAMPLER_DIM_1D:
1359	case GLSL_SAMPLER_DIM_BUF:
1360		coords = 1;
1361		break;
1362	case GLSL_SAMPLER_DIM_2D:
1363	case GLSL_SAMPLER_DIM_RECT:
1364	case GLSL_SAMPLER_DIM_EXTERNAL:
1365	case GLSL_SAMPLER_DIM_MS:
1366		coords = 2;
1367		break;
1368	case GLSL_SAMPLER_DIM_3D:
1369	case GLSL_SAMPLER_DIM_CUBE:
1370		coords = 3;
1371		flags |= IR3_INSTR_3D;
1372		break;
1373	default:
1374		unreachable("bad sampler_dim");
1375	}
1376
1377	if (tex->is_shadow && tex->op != nir_texop_lod)
1378		flags |= IR3_INSTR_S;
1379
1380	if (tex->is_array && tex->op != nir_texop_lod)
1381		flags |= IR3_INSTR_A;
1382
1383	*flagsp = flags;
1384	*coordsp = coords;
1385}
1386
1387static void
1388emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
1389{
1390	struct ir3_block *b = ctx->block;
1391	struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
1392	struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy;
1393	struct ir3_instruction *lod, *compare, *proj;
1394	bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
1395	unsigned i, coords, flags;
1396	unsigned nsrc0 = 0, nsrc1 = 0;
1397	type_t type;
1398	opc_t opc = 0;
1399
1400	coord = off = ddx = ddy = NULL;
1401	lod = proj = compare = NULL;
1402
1403	/* TODO: might just be one component for gathers? */
1404	dst = get_dst(ctx, &tex->dest, 4);
1405
1406	for (unsigned i = 0; i < tex->num_srcs; i++) {
1407		switch (tex->src[i].src_type) {
1408		case nir_tex_src_coord:
1409			coord = get_src(ctx, &tex->src[i].src);
1410			break;
1411		case nir_tex_src_bias:
1412			lod = get_src(ctx, &tex->src[i].src)[0];
1413			has_bias = true;
1414			break;
1415		case nir_tex_src_lod:
1416			lod = get_src(ctx, &tex->src[i].src)[0];
1417			has_lod = true;
1418			break;
1419		case nir_tex_src_comparator: /* shadow comparator */
1420			compare = get_src(ctx, &tex->src[i].src)[0];
1421			break;
1422		case nir_tex_src_projector:
1423			proj = get_src(ctx, &tex->src[i].src)[0];
1424			has_proj = true;
1425			break;
1426		case nir_tex_src_offset:
1427			off = get_src(ctx, &tex->src[i].src);
1428			has_off = true;
1429			break;
1430		case nir_tex_src_ddx:
1431			ddx = get_src(ctx, &tex->src[i].src);
1432			break;
1433		case nir_tex_src_ddy:
1434			ddy = get_src(ctx, &tex->src[i].src);
1435			break;
1436		default:
1437			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
1438					tex->src[i].src_type);
1439			return;
1440		}
1441	}
1442
1443	switch (tex->op) {
1444	case nir_texop_tex:      opc = OPC_SAM;      break;
1445	case nir_texop_txb:      opc = OPC_SAMB;     break;
1446	case nir_texop_txl:      opc = OPC_SAML;     break;
1447	case nir_texop_txd:      opc = OPC_SAMGQ;    break;
1448	case nir_texop_txf:      opc = OPC_ISAML;    break;
1449	case nir_texop_lod:      opc = OPC_GETLOD;   break;
1450	case nir_texop_txf_ms:
1451	case nir_texop_txs:
1452	case nir_texop_tg4:
1453	case nir_texop_query_levels:
1454	case nir_texop_texture_samples:
1455	case nir_texop_samples_identical:
1456	case nir_texop_txf_ms_mcs:
1457		compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
1458		return;
1459	}
1460
1461	tex_info(tex, &flags, &coords);
1462
1463	/*
1464	 * lay out the first argument in the proper order:
1465	 *  - actual coordinates first
1466	 *  - shadow reference
1467	 *  - array index
1468	 *  - projection w
1469	 *  - starting at offset 4, dpdx.xy, dpdy.xy
1470	 *
1471	 * bias/lod go into the second arg
1472	 */
1473
1474	/* insert tex coords: */
1475	for (i = 0; i < coords; i++)
1476		src0[i] = coord[i];
1477
1478	nsrc0 = i;
1479
1480	/* scale up integer coords for TXF based on the LOD */
1481	if (ctx->unminify_coords && (opc == OPC_ISAML)) {
1482		assert(has_lod);
1483		for (i = 0; i < coords; i++)
1484			src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
1485	}
1486
1487	if (coords == 1) {
1488		/* hw doesn't do 1d, so we treat it as 2d with
1489		 * height of 1, and patch up the y coord.
1490		 * TODO: y coord should be (int)0 in some cases..
1491		 */
1492		src0[nsrc0++] = create_immed(b, fui(0.5));
1493	}
1494
1495	if (tex->is_shadow && tex->op != nir_texop_lod)
1496		src0[nsrc0++] = compare;
1497
1498	if (tex->is_array && tex->op != nir_texop_lod) {
1499		struct ir3_instruction *idx = coord[coords];
1500
1501		/* the array coord for cube arrays needs 0.5 added to it */
1502		if (ctx->array_index_add_half && (opc != OPC_ISAML))
1503			idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
1504
1505		src0[nsrc0++] = idx;
1506	}
1507
1508	if (has_proj) {
1509		src0[nsrc0++] = proj;
1510		flags |= IR3_INSTR_P;
1511	}
1512
1513	/* pad to 4, then ddx/ddy: */
1514	if (tex->op == nir_texop_txd) {
1515		while (nsrc0 < 4)
1516			src0[nsrc0++] = create_immed(b, fui(0.0));
1517		for (i = 0; i < coords; i++)
1518			src0[nsrc0++] = ddx[i];
1519		if (coords < 2)
1520			src0[nsrc0++] = create_immed(b, fui(0.0));
1521		for (i = 0; i < coords; i++)
1522			src0[nsrc0++] = ddy[i];
1523		if (coords < 2)
1524			src0[nsrc0++] = create_immed(b, fui(0.0));
1525	}
1526
1527	/*
1528	 * second argument (if applicable):
1529	 *  - offsets
1530	 *  - lod
1531	 *  - bias
1532	 */
1533	if (has_off | has_lod | has_bias) {
1534		if (has_off) {
1535			for (i = 0; i < coords; i++)
1536				src1[nsrc1++] = off[i];
1537			if (coords < 2)
1538				src1[nsrc1++] = create_immed(b, fui(0.0));
1539			flags |= IR3_INSTR_O;
1540		}
1541
1542		if (has_lod | has_bias)
1543			src1[nsrc1++] = lod;
1544	}
1545
1546	switch (tex->dest_type) {
1547	case nir_type_invalid:
1548	case nir_type_float:
1549		type = TYPE_F32;
1550		break;
1551	case nir_type_int:
1552		type = TYPE_S32;
1553		break;
1554	case nir_type_uint:
1555	case nir_type_bool:
1556		type = TYPE_U32;
1557		break;
1558	default:
1559		unreachable("bad dest_type");
1560	}
1561
1562	if (opc == OPC_GETLOD)
1563		type = TYPE_U32;
1564
1565	unsigned tex_idx = tex->texture_index;
1566
1567	ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx);
1568
1569	struct ir3_instruction *col0 = create_collect(b, src0, nsrc0);
1570	struct ir3_instruction *col1 = create_collect(b, src1, nsrc1);
1571
1572	sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW, flags,
1573			tex_idx, tex_idx, col0, col1);
1574
1575	if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) {
1576		/* only need first 3 components: */
1577		sam->regs[0]->wrmask = 0x7;
1578		split_dest(b, dst, sam, 0, 3);
1579
1580		/* we need to sample the alpha separately with a non-ASTC
1581		 * texture state:
1582		 */
1583		sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_W, flags,
1584				tex_idx, tex_idx, col0, col1);
1585
1586		array_insert(ctx->ir->astc_srgb, sam);
1587
1588		/* fixup .w component: */
1589		split_dest(b, &dst[3], sam, 3, 1);
1590	} else {
1591		/* normal (non-workaround) case: */
1592		split_dest(b, dst, sam, 0, 4);
1593	}
1594
1595	/* GETLOD returns results in 4.8 fixed point */
1596	if (opc == OPC_GETLOD) {
1597		struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
1598
1599		compile_assert(ctx, tex->dest_type == nir_type_float);
1600		for (i = 0; i < 2; i++) {
1601			dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0,
1602							   factor, 0);
1603		}
1604	}
1605}
1606
1607static void
1608emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex)
1609{
1610	struct ir3_block *b = ctx->block;
1611	struct ir3_instruction **dst, *sam;
1612
1613	dst = get_dst(ctx, &tex->dest, 1);
1614
1615	sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, TGSI_WRITEMASK_Z, 0,
1616			tex->texture_index, tex->texture_index, NULL, NULL);
1617
1618	/* even though there is only one component, since it ends
1619	 * up in .z rather than .x, we need a split_dest()
1620	 */
1621	split_dest(b, dst, sam, 0, 3);
1622
1623	/* The # of levels comes from getinfo.z. We need to add 1 to it, since
1624	 * the value in TEX_CONST_0 is zero-based.
1625	 */
1626	if (ctx->levels_add_one)
1627		dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
1628}
1629
1630static void
1631emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
1632{
1633	struct ir3_block *b = ctx->block;
1634	struct ir3_instruction **dst, *sam;
1635	struct ir3_instruction *lod;
1636	unsigned flags, coords;
1637
1638	tex_info(tex, &flags, &coords);
1639
1640	/* Actually we want the number of dimensions, not coordinates. This
1641	 * distinction only matters for cubes.
1642	 */
1643	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
1644		coords = 2;
1645
1646	dst = get_dst(ctx, &tex->dest, 4);
1647
1648	compile_assert(ctx, tex->num_srcs == 1);
1649	compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod);
1650
1651	lod = get_src(ctx, &tex->src[0].src)[0];
1652
1653	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
1654			tex->texture_index, tex->texture_index, lod, NULL);
1655
1656	split_dest(b, dst, sam, 0, 4);
1657
1658	/* Array size actually ends up in .w rather than .z. This doesn't
1659	 * matter for miplevel 0, but for higher mips the value in z is
1660	 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1661	 * returned, which means that we have to add 1 to it for arrays.
1662	 */
1663	if (tex->is_array) {
1664		if (ctx->levels_add_one) {
1665			dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
1666		} else {
1667			dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
1668		}
1669	}
1670}
1671
1672static void
1673emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi)
1674{
1675	struct ir3_instruction *phi, **dst;
1676
1677	/* NOTE: phi's should be lowered to scalar at this point */
1678	compile_assert(ctx, nphi->dest.ssa.num_components == 1);
1679
1680	dst = get_dst(ctx, &nphi->dest, 1);
1681
1682	phi = ir3_instr_create2(ctx->block, OPC_META_PHI,
1683			1 + exec_list_length(&nphi->srcs));
1684	ir3_reg_create(phi, 0, 0);         /* dst */
1685	phi->phi.nphi = nphi;
1686
1687	dst[0] = phi;
1688}
1689
1690/* phi instructions are left partially constructed.  We don't resolve
1691 * their srcs until the end of the block, since (eg. loops) one of
1692 * the phi's srcs might be defined after the phi due to back edges in
1693 * the CFG.
1694 */
1695static void
1696resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
1697{
1698	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
1699		nir_phi_instr *nphi;
1700
1701		/* phi's only come at start of block: */
1702		if (instr->opc != OPC_META_PHI)
1703			break;
1704
1705		if (!instr->phi.nphi)
1706			break;
1707
1708		nphi = instr->phi.nphi;
1709		instr->phi.nphi = NULL;
1710
1711		foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) {
1712			struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0];
1713
1714			/* NOTE: src might not be in the same block as it comes from
1715			 * according to the phi.. but in the end the backend assumes
1716			 * it will be able to assign the same register to each (which
1717			 * only works if it is assigned in the src block), so insert
1718			 * an extra mov to make sure the phi src is assigned in the
1719			 * block it comes from:
1720			 */
1721			src = ir3_MOV(get_block(ctx, nsrc->pred), src, TYPE_U32);
1722
1723			ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
1724		}
1725	}
1726}
1727
1728static void
1729emit_jump(struct ir3_compile *ctx, nir_jump_instr *jump)
1730{
1731	switch (jump->type) {
1732	case nir_jump_break:
1733	case nir_jump_continue:
1734		/* I *think* we can simply just ignore this, and use the
1735		 * successor block link to figure out where we need to
1736		 * jump to for break/continue
1737		 */
1738		break;
1739	default:
1740		compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
1741		break;
1742	}
1743}
1744
1745static void
1746emit_instr(struct ir3_compile *ctx, nir_instr *instr)
1747{
1748	switch (instr->type) {
1749	case nir_instr_type_alu:
1750		emit_alu(ctx, nir_instr_as_alu(instr));
1751		break;
1752	case nir_instr_type_intrinsic:
1753		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
1754		break;
1755	case nir_instr_type_load_const:
1756		emit_load_const(ctx, nir_instr_as_load_const(instr));
1757		break;
1758	case nir_instr_type_ssa_undef:
1759		emit_undef(ctx, nir_instr_as_ssa_undef(instr));
1760		break;
1761	case nir_instr_type_tex: {
1762		nir_tex_instr *tex = nir_instr_as_tex(instr);
		/* a couple tex instructions get special-cased:
		 */
1765		switch (tex->op) {
1766		case nir_texop_txs:
1767			emit_tex_txs(ctx, tex);
1768			break;
1769		case nir_texop_query_levels:
1770			emit_tex_query_levels(ctx, tex);
1771			break;
1772		default:
1773			emit_tex(ctx, tex);
1774			break;
1775		}
1776		break;
1777	}
1778	case nir_instr_type_phi:
1779		emit_phi(ctx, nir_instr_as_phi(instr));
1780		break;
1781	case nir_instr_type_jump:
1782		emit_jump(ctx, nir_instr_as_jump(instr));
1783		break;
1784	case nir_instr_type_call:
1785	case nir_instr_type_parallel_copy:
1786		compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
1787		break;
1788	}
1789}
1790
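/* Lazily create (and cache in block_ht) the ir3_block corresponding to
 * a given nir_block, so successor links and phi srcs resolve to the
 * same block:
 */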
1791static struct ir3_block *
1792get_block(struct ir3_compile *ctx, nir_block *nblock)
1793{
1794	struct ir3_block *block;
1795	struct hash_entry *entry;
1796	entry = _mesa_hash_table_search(ctx->block_ht, nblock);
1797	if (entry)
1798		return entry->data;
1799
1800	block = ir3_block_create(ctx->ir);
1801	block->nblock = nblock;
1802	_mesa_hash_table_insert(ctx->block_ht, nblock, block);
1803
1804	return block;
1805}
1806
1807static void
1808emit_block(struct ir3_compile *ctx, nir_block *nblock)
1809{
1810	struct ir3_block *block = get_block(ctx, nblock);
1811
1812	for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
1813		if (nblock->successors[i]) {
1814			block->successors[i] =
1815				get_block(ctx, nblock->successors[i]);
1816		}
1817	}
1818
1819	ctx->block = block;
1820	list_addtail(&block->node, &ctx->ir->block_list);
1821
1822	/* re-emit addr register in each block if needed: */
1823	_mesa_hash_table_destroy(ctx->addr_ht, NULL);
1824	ctx->addr_ht = NULL;
1825
1826	nir_foreach_instr(instr, nblock) {
1827		emit_instr(ctx, instr);
1828		if (ctx->error)
1829			return;
1830	}
1831}
1832
1833static void emit_cf_list(struct ir3_compile *ctx, struct exec_list *list);
1834
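/* NOTE: rather than emitting explicit branch instructions, the block's
 * 'condition' (written to p0.x) plus its two successor links are what
 * later stages use to decide which path is taken:
 */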
1835static void
1836emit_if(struct ir3_compile *ctx, nir_if *nif)
1837{
1838	struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0];
1839
1840	ctx->block->condition =
1841		get_predicate(ctx, ir3_b2n(condition->block, condition));
1842
1843	emit_cf_list(ctx, &nif->then_list);
1844	emit_cf_list(ctx, &nif->else_list);
1845}
1846
static void
emit_loop(struct ir3_compile *ctx, nir_loop *nloop)
{
	emit_cf_list(ctx, &nloop->body);
}

static void
emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
{
	foreach_list_typed(nir_cf_node, node, node, list) {
		switch (node->type) {
		case nir_cf_node_block:
			emit_block(ctx, nir_cf_node_as_block(node));
			break;
		case nir_cf_node_if:
			emit_if(ctx, nir_cf_node_as_if(node));
			break;
		case nir_cf_node_loop:
			emit_loop(ctx, nir_cf_node_as_loop(node));
			break;
		case nir_cf_node_function:
			compile_error(ctx, "TODO\n");
			break;
		}
	}
}

/* emit stream-out code.  At this point, the current block is the original
 * (nir) end block, and nir ensures that all flow control paths terminate
 * into the end block.  We re-purpose the original end block to generate
 * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
 * block holding stream-out write instructions, followed by the new end
 * block:
 *
 *   blockOrigEnd {
 *      p0.x = (vtxcnt < maxvtxcnt)
 *      // succs: blockStreamOut, blockNewEnd
 *   }
 *   blockStreamOut {
 *      ... stream-out instructions ...
 *      // succs: blockNewEnd
 *   }
 *   blockNewEnd {
 *   }
 */
static void
emit_stream_out(struct ir3_compile *ctx)
{
	struct ir3_shader_variant *v = ctx->so;
	struct ir3 *ir = ctx->ir;
	struct pipe_stream_output_info *strmout =
			&ctx->so->shader->stream_output;
	struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
	struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
	struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS];

	/* create vtxcnt input in input block at top of shader,
	 * so that it is seen as live over the entire duration
	 * of the shader:
	 */
	vtxcnt = create_input(ctx->in_block, 0);
	add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt);

	maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);

	/* at this point, we are at the original 'end' block,
	 * re-purpose this block to stream-out condition, then
	 * append stream-out block and new-end block
	 */
	orig_end_block = ctx->block;

	stream_out_block = ir3_block_create(ir);
	list_addtail(&stream_out_block->node, &ir->block_list);

	new_end_block = ir3_block_create(ir);
	list_addtail(&new_end_block->node, &ir->block_list);

	orig_end_block->successors[0] = stream_out_block;
	orig_end_block->successors[1] = new_end_block;
	stream_out_block->successors[0] = new_end_block;

	/* setup 'if (vtxcnt < maxvtxcnt)' condition: */
	cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
	cond->regs[0]->num = regid(REG_P0, 0);
	cond->cat2.condition = IR3_COND_LT;

	/* the condition is attached to the block preceding the conditional,
	 * since it is used to pick which of the two successor
	 * paths to take:
	 */
	orig_end_block->condition = cond;

	/* switch to stream_out_block to generate the stream-out
	 * instructions:
	 */
	ctx->block = stream_out_block;

	/* Calculate base addresses based on vtxcnt.  Instructions
	 * generated for bases not used in following loop will be
	 * stripped out in the backend.
	 */
	for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
		unsigned stride = strmout->stride[i];
		struct ir3_instruction *base, *off;

		base = create_uniform(ctx, regid(v->constbase.tfbo, i));

		/* 24-bit should be enough: */
		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
				create_immed(ctx->block, stride * 4), 0);

		bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
	}

	/* Generate the per-output store instructions: */
	for (unsigned i = 0; i < strmout->num_outputs; i++) {
		for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
			unsigned c = j + strmout->output[i].start_component;
			struct ir3_instruction *base, *out, *stg;

			base = bases[strmout->output[i].output_buffer];
			out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];

			stg = ir3_STG(ctx->block, base, 0, out, 0,
					create_immed(ctx->block, 1), 0);
			stg->cat6.type = TYPE_U32;
			stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;

			array_insert(ctx->ir->keeps, stg);
		}
	}

	/* and finally switch to the new_end_block: */
	ctx->block = new_end_block;
}

static void
emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
{
	nir_metadata_require(impl, nir_metadata_block_index);

	emit_cf_list(ctx, &impl->body);
	emit_block(ctx, impl->end_block);

	/* at this point, we should have a single empty block,
	 * into which we emit the 'end' instruction.
	 */
	compile_assert(ctx, list_empty(&ctx->block->instr_list));

	/* If stream-out (aka transform-feedback) enabled, emit the
	 * stream-out instructions, followed by a new empty block (into
	 * which the 'end' instruction lands).
	 *
	 * NOTE: it is done in this order, rather than inserting before
	 * we emit end_block, because NIR guarantees that all blocks
	 * flow into end_block, and that end_block has no successors.
	 * So by re-purposing end_block as the first block of stream-
	 * out, we guarantee that all exit paths flow into the stream-
	 * out instructions.
	 */
	if ((ctx->compiler->gpu_id < 500) &&
			(ctx->so->shader->stream_output.num_outputs > 0) &&
			!ctx->so->key.binning_pass) {
		debug_assert(ctx->so->type == SHADER_VERTEX);
		emit_stream_out(ctx);
	}

	ir3_END(ctx->block);
}

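/* Setup a single shader input: record slot/interpolation info in the
 * shader variant and create the corresponding input instructions
 * (bary.f/ldlv based varying fetch for frag shaders, plain inputs for
 * vertex shaders):
 */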
static void
setup_input(struct ir3_compile *ctx, nir_variable *in)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned array_len = MAX2(glsl_get_length(in->type), 1);
	unsigned ncomp = glsl_get_components(in->type);
	unsigned n = in->data.driver_location;
	unsigned slot = in->data.location;

	DBG("; in: slot=%u, len=%ux%u, drvloc=%u",
			slot, array_len, ncomp, n);

	/* let's pretend things other than vec4 don't exist: */
	ncomp = MAX2(ncomp, 4);
	compile_assert(ctx, ncomp == 4);

	so->inputs[n].slot = slot;
	so->inputs[n].compmask = (1 << ncomp) - 1;
	so->inputs_count = MAX2(so->inputs_count, n + 1);
	so->inputs[n].interpolate = in->data.interpolation;

	if (ctx->so->type == SHADER_FRAGMENT) {
		for (int i = 0; i < ncomp; i++) {
			struct ir3_instruction *instr = NULL;
			unsigned idx = (n * 4) + i;

			if (slot == VARYING_SLOT_POS) {
				so->inputs[n].bary = false;
				so->frag_coord = true;
				instr = create_frag_coord(ctx, i);
			} else if (slot == VARYING_SLOT_PNTC) {
				/* see for example st_get_generic_varying_index().. this is
				 * maybe a bit mesa/st specific.  But we need things to line
				 * up for this in fdN_program:
				 *    unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
				 *    if (emit->sprite_coord_enable & texmask) {
				 *       ...
				 *    }
				 */
				so->inputs[n].slot = VARYING_SLOT_VAR8;
				so->inputs[n].bary = true;
				instr = create_frag_input(ctx, false);
			} else {
				bool use_ldlv = false;

				/* detect the special case for front/back colors where
				 * we need to do flat vs smooth shading depending on
				 * rast state:
				 */
				if (in->data.interpolation == INTERP_MODE_NONE) {
					switch (slot) {
					case VARYING_SLOT_COL0:
					case VARYING_SLOT_COL1:
					case VARYING_SLOT_BFC0:
					case VARYING_SLOT_BFC1:
						so->inputs[n].rasterflat = true;
						break;
					default:
						break;
					}
				}

				if (ctx->flat_bypass) {
					if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
							(so->inputs[n].rasterflat && ctx->so->key.rasterflat))
						use_ldlv = true;
				}

				so->inputs[n].bary = true;

				instr = create_frag_input(ctx, use_ldlv);
			}

			compile_assert(ctx, idx < ctx->ir->ninputs);

			ctx->ir->inputs[idx] = instr;
		}
	} else if (ctx->so->type == SHADER_VERTEX) {
		for (int i = 0; i < ncomp; i++) {
			unsigned idx = (n * 4) + i;
			compile_assert(ctx, idx < ctx->ir->ninputs);
			ctx->ir->inputs[idx] = create_input(ctx->block, idx);
		}
	} else {
		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
	}

	if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
		so->total_in += ncomp;
	}
}

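/* Setup a single shader output: record slot info in the shader variant
 * and initialize the corresponding ir->outputs[] slots with immed 0.0:
 */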
static void
setup_output(struct ir3_compile *ctx, nir_variable *out)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned array_len = MAX2(glsl_get_length(out->type), 1);
	unsigned ncomp = glsl_get_components(out->type);
	unsigned n = out->data.driver_location;
	unsigned slot = out->data.location;
	unsigned comp = 0;

	DBG("; out: slot=%u, len=%ux%u, drvloc=%u",
			slot, array_len, ncomp, n);

	/* let's pretend things other than vec4 don't exist: */
	ncomp = MAX2(ncomp, 4);
	compile_assert(ctx, ncomp == 4);

	if (ctx->so->type == SHADER_FRAGMENT) {
		switch (slot) {
		case FRAG_RESULT_DEPTH:
			comp = 2;  /* tgsi will write to .z component */
			so->writes_pos = true;
			break;
		case FRAG_RESULT_COLOR:
			so->color0_mrt = 1;
			break;
		default:
			if (slot >= FRAG_RESULT_DATA0)
				break;
			compile_error(ctx, "unknown FS output name: %s\n",
					gl_frag_result_name(slot));
		}
	} else if (ctx->so->type == SHADER_VERTEX) {
		switch (slot) {
		case VARYING_SLOT_POS:
			so->writes_pos = true;
			break;
		case VARYING_SLOT_PSIZ:
			so->writes_psize = true;
			break;
		case VARYING_SLOT_COL0:
		case VARYING_SLOT_COL1:
		case VARYING_SLOT_BFC0:
		case VARYING_SLOT_BFC1:
		case VARYING_SLOT_FOGC:
		case VARYING_SLOT_CLIP_DIST0:
		case VARYING_SLOT_CLIP_DIST1:
		case VARYING_SLOT_CLIP_VERTEX:
			break;
		default:
			if (slot >= VARYING_SLOT_VAR0)
				break;
			if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
				break;
			compile_error(ctx, "unknown VS output name: %s\n",
					gl_varying_slot_name(slot));
		}
	} else {
		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
	}

	compile_assert(ctx, n < ARRAY_SIZE(so->outputs));

	so->outputs[n].slot = slot;
	so->outputs[n].regid = regid(n, comp);
	so->outputs_count = MAX2(so->outputs_count, n + 1);

	for (int i = 0; i < ncomp; i++) {
		unsigned idx = (n * 4) + i;
		compile_assert(ctx, idx < ctx->ir->noutputs);
		ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
	}
}

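/* Find the highest driver_location in a variable list, used to size
 * the (vec4) input/output arrays:
 */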
static int
max_drvloc(struct exec_list *vars)
{
	int drvloc = -1;
	nir_foreach_variable(var, vars) {
		drvloc = MAX2(drvloc, (int)var->data.driver_location);
	}
	return drvloc;
}

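/* Top-level NIR -> ir3 conversion: allocate the ir3, setup inputs/
 * outputs and array variables, then emit the function body:
 */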
static void
emit_instructions(struct ir3_compile *ctx)
{
	unsigned ninputs, noutputs;
	nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);

	ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
	noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;

	/* for vtx shaders, we need to leave room for sysvals:
	 */
	if (ctx->so->type == SHADER_VERTEX) {
		ninputs += 16;
	}

	ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);

	/* Create inputs in first block: */
	ctx->block = get_block(ctx, nir_start_block(fxn));
	ctx->in_block = ctx->block;
	list_addtail(&ctx->block->node, &ctx->ir->block_list);

	if (ctx->so->type == SHADER_VERTEX) {
		ctx->ir->ninputs -= 16;
	}

	/* for fragment shader, we have a single input register (usually
	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
	 */
	if (ctx->so->type == SHADER_FRAGMENT) {
		// TODO maybe a helper for fi since we need it a few places..
		struct ir3_instruction *instr;
		instr = ir3_instr_create(ctx->block, OPC_META_FI);
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
		ctx->frag_pos = instr;
	}

	/* Setup inputs: */
	nir_foreach_variable(var, &ctx->s->inputs) {
		setup_input(ctx, var);
	}

	/* Setup outputs: */
	nir_foreach_variable(var, &ctx->s->outputs) {
		setup_output(ctx, var);
	}

	/* Setup global variables (which should only be arrays): */
	nir_foreach_variable(var, &ctx->s->globals) {
		declare_var(ctx, var);
	}

	/* Setup local variables (which should only be arrays): */
	/* NOTE: need to do something more clever when we support >1 fxn */
	nir_foreach_variable(var, &fxn->locals) {
		declare_var(ctx, var);
	}

	/* And emit the body: */
	ctx->impl = fxn;
	emit_function(ctx, fxn);

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		resolve_phis(ctx, block);
	}
}

/* From the NIR perspective, we actually have inputs.  But most of the
 * "inputs" for a fragment shader are just bary.f instructions.  The
 * *actual* inputs from the hw perspective are the frag_pos and
 * optionally frag_coord and frag_face.
 */
static void
fixup_frag_inputs(struct ir3_compile *ctx)
{
	struct ir3_shader_variant *so = ctx->so;
	struct ir3 *ir = ctx->ir;
	struct ir3_instruction **inputs;
	struct ir3_instruction *instr;
	int n, regid = 0;

	ir->ninputs = 0;

	n  = 4;  /* always have frag_pos */
	n += COND(so->frag_face, 4);
	n += COND(so->frag_coord, 4);

	inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));

	if (so->frag_face) {
		/* this ultimately gets assigned to hr0.x so doesn't conflict
		 * with frag_coord/frag_pos..
		 */
		inputs[ir->ninputs++] = ctx->frag_face;
		ctx->frag_face->regs[0]->num = 0;

		/* remaining channels not used, but let's avoid confusing
		 * other parts that expect inputs to come in groups of vec4
		 */
		inputs[ir->ninputs++] = NULL;
		inputs[ir->ninputs++] = NULL;
		inputs[ir->ninputs++] = NULL;
	}

	/* since we don't know where to set the regid for frag_coord,
	 * we have to use r0.x for it.  But we don't want to *always*
	 * use r1.x for frag_pos as that could increase the register
	 * footprint on simple shaders:
	 */
	if (so->frag_coord) {
		ctx->frag_coord[0]->regs[0]->num = regid++;
		ctx->frag_coord[1]->regs[0]->num = regid++;
		ctx->frag_coord[2]->regs[0]->num = regid++;
		ctx->frag_coord[3]->regs[0]->num = regid++;

		inputs[ir->ninputs++] = ctx->frag_coord[0];
		inputs[ir->ninputs++] = ctx->frag_coord[1];
		inputs[ir->ninputs++] = ctx->frag_coord[2];
		inputs[ir->ninputs++] = ctx->frag_coord[3];
	}

	/* we always have frag_pos: */
	so->pos_regid = regid;

	/* r0.x */
	instr = create_input(ctx->in_block, ir->ninputs);
	instr->regs[0]->num = regid++;
	inputs[ir->ninputs++] = instr;
	ctx->frag_pos->regs[1]->instr = instr;

	/* r0.y */
	instr = create_input(ctx->in_block, ir->ninputs);
	instr->regs[0]->num = regid++;
	inputs[ir->ninputs++] = instr;
	ctx->frag_pos->regs[2]->instr = instr;

	ir->inputs = inputs;
}

/* Fixup tex sampler state for astc/srgb workaround instructions.  We
 * need to assign the tex state indexes for these after we know the
 * max tex index.
 */
static void
fixup_astc_srgb(struct ir3_compile *ctx)
{
	struct ir3_shader_variant *so = ctx->so;
	/* indexed by original tex idx, value is newly assigned alpha sampler
	 * state tex idx.  Zero is invalid since there is at least one sampler
	 * if we get here.
	 */
	unsigned alt_tex_state[16] = {0};
	unsigned tex_idx = ctx->max_texture_index + 1;
	unsigned idx = 0;

	so->astc_srgb.base = tex_idx;

	for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
		struct ir3_instruction *sam = ctx->ir->astc_srgb[i];

		compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));

		if (alt_tex_state[sam->cat5.tex] == 0) {
			/* assign new alternate/alpha tex state slot: */
			alt_tex_state[sam->cat5.tex] = tex_idx++;
			so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
			so->astc_srgb.count++;
		}

		sam->cat5.tex = alt_tex_state[sam->cat5.tex];
	}
}

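/* Main entry point for compiling a shader variant from NIR.  Roughly:
 * emit instructions, copy-propagate (ir3_cp), group neighbors
 * (ir3_group), compute depth (ir3_depth), schedule (ir3_sched),
 * register-allocate (ir3_ra), and finally legalize (ir3_legalize),
 * fixing up input/output register assignment along the way:
 */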
int
ir3_compile_shader_nir(struct ir3_compiler *compiler,
		struct ir3_shader_variant *so)
{
	struct ir3_compile *ctx;
	struct ir3 *ir;
	struct ir3_instruction **inputs;
	unsigned i, j, actual_in, inloc;
	int ret = 0, max_bary;

	assert(!so->ir);

	ctx = compile_init(compiler, so);
	if (!ctx) {
		DBG("INIT failed!");
		ret = -1;
		goto out;
	}

	emit_instructions(ctx);

	if (ctx->error) {
		DBG("EMIT failed!");
		ret = -1;
		goto out;
	}

	ir = so->ir = ctx->ir;

	/* keep track of the inputs from TGSI perspective.. */
	inputs = ir->inputs;

	/* but fixup actual inputs for frag shader: */
	if (so->type == SHADER_FRAGMENT)
		fixup_frag_inputs(ctx);

	/* at this point, for binning pass, throw away unneeded outputs: */
	if (so->key.binning_pass) {
		for (i = 0, j = 0; i < so->outputs_count; i++) {
			unsigned slot = so->outputs[i].slot;

			/* throw away everything but first position/psize */
			if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
				if (i != j) {
					so->outputs[j] = so->outputs[i];
					ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
					ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1];
					ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2];
					ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3];
				}
				j++;
			}
		}
		so->outputs_count = j;
		ir->noutputs = j * 4;
	}

	/* if we want half-precision outputs, mark the output registers
	 * as half:
	 */
	if (so->key.half_precision) {
		for (i = 0; i < ir->noutputs; i++) {
			struct ir3_instruction *out = ir->outputs[i];
			if (!out)
				continue;
			out->regs[0]->flags |= IR3_REG_HALF;
			/* output could be a fanout (ie. texture fetch output)
			 * in which case we need to propagate the half-reg flag
			 * up to the definer so that RA sees it:
			 */
			if (out->opc == OPC_META_FO) {
				out = out->regs[1]->instr;
				out->regs[0]->flags |= IR3_REG_HALF;
			}

			if (out->opc == OPC_MOV) {
				out->cat1.dst_type = half_type(out->cat1.dst_type);
			}
		}
	}

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("BEFORE CP:\n");
		ir3_print(ir);
	}

	ir3_cp(ir, so);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("BEFORE GROUPING:\n");
		ir3_print(ir);
	}

	/* Group left/right neighbors, inserting mov's where needed to
	 * solve conflicts:
	 */
	ir3_group(ir);

	ir3_depth(ir);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER DEPTH:\n");
		ir3_print(ir);
	}

	ret = ir3_sched(ir);
	if (ret) {
		DBG("SCHED failed!");
		goto out;
	}

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER SCHED:\n");
		ir3_print(ir);
	}

	ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
	if (ret) {
		DBG("RA failed!");
		goto out;
	}

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER RA:\n");
		ir3_print(ir);
	}

	/* fixup input/outputs: */
	for (i = 0; i < so->outputs_count; i++) {
		so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
	}

	/* Note that some or all channels of an input may be unused: */
	actual_in = 0;
	inloc = 0;
	for (i = 0; i < so->inputs_count; i++) {
		unsigned j, regid = ~0, compmask = 0, maxcomp = 0;
		so->inputs[i].ncomp = 0;
		so->inputs[i].inloc = inloc;
		for (j = 0; j < 4; j++) {
			struct ir3_instruction *in = inputs[(i*4) + j];
			if (in && !(in->flags & IR3_INSTR_UNUSED)) {
				compmask |= (1 << j);
				regid = in->regs[0]->num - j;
				actual_in++;
				so->inputs[i].ncomp++;
				if ((so->type == SHADER_FRAGMENT) && so->inputs[i].bary) {
					/* assign inloc: */
					assert(in->regs[1]->flags & IR3_REG_IMMED);
					in->regs[1]->iim_val = inloc + j;
					maxcomp = j + 1;
				}
			}
		}
		if ((so->type == SHADER_FRAGMENT) && compmask && so->inputs[i].bary) {
			so->varying_in++;
			so->inputs[i].compmask = (1 << maxcomp) - 1;
			inloc += maxcomp;
		} else {
			so->inputs[i].compmask = compmask;
		}
		so->inputs[i].regid = regid;
	}

	if (ctx->astc_srgb)
		fixup_astc_srgb(ctx);

	/* For frag shaders, we need to do legalize after the "bary.f"
	 * offsets (inloc) have been assigned.
	 */
	ir3_legalize(ir, &so->has_samp, &max_bary);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER LEGALIZE:\n");
		ir3_print(ir);
	}

	/* Note that actual_in counts inputs that are not bary.f'd for FS: */
	if (so->type == SHADER_VERTEX)
		so->total_in = actual_in;
	else
		so->total_in = max_bary + 1;

out:
	if (ret) {
		if (so->ir)
			ir3_destroy(so->ir);
		so->ir = NULL;
	}
	compile_free(ctx);

	return ret;
}
