r600_asm.c revision da676eab93e7dad30b574b4eb4cffd4df952e819
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_opcodes.h"
25#include "r600_formats.h"
26#include "r600d.h"
27
28#include <errno.h>
29#include <byteswap.h>
30#include "util/u_memory.h"
31#include "pipe/p_shader_tokens.h"
32
33#define NUM_OF_CYCLES 3
34#define NUM_OF_COMPONENTS 4
35
36static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
37{
38	if(alu->is_op3)
39		return 3;
40
41	switch (bc->chip_class) {
42	case R600:
43	case R700:
44		switch (alu->inst) {
45		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
46			return 0;
47		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
48		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
49		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT:
50		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT:
51		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT:
52		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
53		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
54		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
55		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
56		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
57		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_IEEE:
58		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT:
59		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT:
60		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
61		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT:
62		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
63		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
64		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT:
65		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT:
66		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT:
67		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT:
68		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
69		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT:
70		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
71		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT:
72		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
73		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT:
74		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT:
75		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
76		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
77		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
78		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
79		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
80		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
81		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
82		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT:
83		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT:
84		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
85		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
86		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
87		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT:
88		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT:
89		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT:
90		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT:
91			return 2;
92
93		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
94		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
95		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
96		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT:
97		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
98		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
99		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL:
100		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
101		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
102		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
103		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
104		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
105		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
106		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
107		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT:
108		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT:
109		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
110		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
111		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
112		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
113		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT:
114		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT:
115		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
116		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
117		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE:
118		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT:
119			return 1;
120		default: R600_ERR(
121			"Need instruction operand number for 0x%x.\n", alu->inst);
122		}
123		break;
124	case EVERGREEN:
125	case CAYMAN:
126		switch (alu->inst) {
127		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
128			return 0;
129		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
130		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
131		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT:
132		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT:
133		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT:
134		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
135		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
136		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
137		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
138		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
139		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_IEEE:
140		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT:
141		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT:
142		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
143		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT:
144		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
145		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
146		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT:
147		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT:
148		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT:
149		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT:
150		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
151		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT:
152		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
153		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT:
154		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
155		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT:
156		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT:
157		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
158		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
159		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
160		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
161		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT:
162		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
163		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
164		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
165		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT:
166		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
167		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
168		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
169		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY:
170		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW:
171		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT:
172		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT:
173		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT:
174		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT:
175			return 2;
176
177		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
178		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
179		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
180		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL:
181		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
182		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
183		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
184		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
185		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
186		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
187		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
188		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
189		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
190		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
191		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR:
192		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
193		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT:
194		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT:
195		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
196		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
197		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE:
198		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT:
199		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0:
200		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT:
201		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT:
202			return 1;
203		default: R600_ERR(
204			"Need instruction operand number for 0x%x.\n", alu->inst);
205		}
206		break;
207	}
208
209	return 3;
210}
211
212int r700_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id);
213
214static struct r600_bytecode_cf *r600_bytecode_cf(void)
215{
216	struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf);
217
218	if (cf == NULL)
219		return NULL;
220	LIST_INITHEAD(&cf->list);
221	LIST_INITHEAD(&cf->alu);
222	LIST_INITHEAD(&cf->vtx);
223	LIST_INITHEAD(&cf->tex);
224	return cf;
225}
226
227static struct r600_bytecode_alu *r600_bytecode_alu(void)
228{
229	struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu);
230
231	if (alu == NULL)
232		return NULL;
233	LIST_INITHEAD(&alu->list);
234	return alu;
235}
236
237static struct r600_bytecode_vtx *r600_bytecode_vtx(void)
238{
239	struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx);
240
241	if (vtx == NULL)
242		return NULL;
243	LIST_INITHEAD(&vtx->list);
244	return vtx;
245}
246
247static struct r600_bytecode_tex *r600_bytecode_tex(void)
248{
249	struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex);
250
251	if (tex == NULL)
252		return NULL;
253	LIST_INITHEAD(&tex->list);
254	return tex;
255}
256
257void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family)
258{
259	if ((chip_class == R600) && (family != CHIP_RV670))
260		bc->ar_handling = AR_HANDLE_RV6XX;
261	else
262		bc->ar_handling = AR_HANDLE_NORMAL;
263
264	if ((chip_class == R600) && (family != CHIP_RV670 && family != CHIP_RS780 &&
265					   family != CHIP_RS880))
266		bc->r6xx_nop_after_rel_dst = 1;
267	else
268		bc->r6xx_nop_after_rel_dst = 0;
269	LIST_INITHEAD(&bc->cf);
270	bc->chip_class = chip_class;
271}
272
273static int r600_bytecode_add_cf(struct r600_bytecode *bc)
274{
275	struct r600_bytecode_cf *cf = r600_bytecode_cf();
276
277	if (cf == NULL)
278		return -ENOMEM;
279	LIST_ADDTAIL(&cf->list, &bc->cf);
280	if (bc->cf_last) {
281		cf->id = bc->cf_last->id + 2;
282		if (bc->cf_last->eg_alu_extended) {
283			/* take into account extended alu size */
284			cf->id += 2;
285			bc->ndw += 2;
286		}
287	}
288	bc->cf_last = cf;
289	bc->ncf++;
290	bc->ndw += 2;
291	bc->force_add_cf = 0;
292	bc->ar_loaded = 0;
293	return 0;
294}
295
296int r600_bytecode_add_output(struct r600_bytecode *bc, const struct r600_bytecode_output *output)
297{
298	int r;
299
300	if (output->gpr >= bc->ngpr)
301		bc->ngpr = output->gpr + 1;
302
303	if (bc->cf_last && (bc->cf_last->inst == output->inst ||
304		(bc->cf_last->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT) &&
305		output->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE))) &&
306		output->type == bc->cf_last->output.type &&
307		output->elem_size == bc->cf_last->output.elem_size &&
308		output->swizzle_x == bc->cf_last->output.swizzle_x &&
309		output->swizzle_y == bc->cf_last->output.swizzle_y &&
310		output->swizzle_z == bc->cf_last->output.swizzle_z &&
311		output->swizzle_w == bc->cf_last->output.swizzle_w &&
312		(output->burst_count + bc->cf_last->output.burst_count) <= 16) {
313
314		if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
315			(output->array_base + output->burst_count) == bc->cf_last->output.array_base) {
316
317			bc->cf_last->output.end_of_program |= output->end_of_program;
318			bc->cf_last->output.inst = output->inst;
319			bc->cf_last->output.gpr = output->gpr;
320			bc->cf_last->output.array_base = output->array_base;
321			bc->cf_last->output.burst_count += output->burst_count;
322			return 0;
323
324		} else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
325			output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {
326
327			bc->cf_last->output.end_of_program |= output->end_of_program;
328			bc->cf_last->output.inst = output->inst;
329			bc->cf_last->output.burst_count += output->burst_count;
330			return 0;
331		}
332	}
333
334	r = r600_bytecode_add_cf(bc);
335	if (r)
336		return r;
337	bc->cf_last->inst = output->inst;
338	memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output));
339	return 0;
340}
341
342/* alu instructions that can ony exits once per group */
343static int is_alu_once_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
344{
345	switch (bc->chip_class) {
346	case R600:
347	case R700:
348		return !alu->is_op3 && (
349			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
350			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
351			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
352			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
353			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
354			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
355			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
356			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
357			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
358			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
359			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
360			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
361			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
362			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
363			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
364			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
365			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
366			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
367			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
368			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
369			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
370			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
371			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
372			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
373			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
374			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
375			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
376			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
377			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
378			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
379			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
380			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
381			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
382			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
383	case EVERGREEN:
384	case CAYMAN:
385	default:
386		return !alu->is_op3 && (
387			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
388			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
389			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
390			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
391			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
392			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
393			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
394			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
395			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
396			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
397			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
398			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
399			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
400			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
401			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
402			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
403			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
404			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
405			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
406			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
407			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
408			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
409			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
410			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
411			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
412			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
413			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
414			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
415			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
416			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
417			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
418			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
419			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
420			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
421	}
422}
423
424static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
425{
426	switch (bc->chip_class) {
427	case R600:
428	case R700:
429		return !alu->is_op3 && (
430			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
431			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
432			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
433			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
434	case EVERGREEN:
435	case CAYMAN:
436	default:
437		return !alu->is_op3 && (
438			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
439			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
440			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
441			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
442	}
443}
444
445static int is_alu_cube_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
446{
447	switch (bc->chip_class) {
448	case R600:
449	case R700:
450		return !alu->is_op3 &&
451			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
452	case EVERGREEN:
453	case CAYMAN:
454	default:
455		return !alu->is_op3 &&
456			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
457	}
458}
459
460static int is_alu_mova_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
461{
462	switch (bc->chip_class) {
463	case R600:
464	case R700:
465		return !alu->is_op3 && (
466			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
467			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
468			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT ||
469			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT);
470	case EVERGREEN:
471	case CAYMAN:
472	default:
473		return !alu->is_op3 && (
474			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
475	}
476}
477
/* Return 1 when opcode lies in the closed interval [min, max], else 0. */
static int is_opcode_in_range(unsigned opcode, unsigned min, unsigned max)
{
	if (opcode < min)
		return 0;
	return opcode <= max;
}
482
483/* ALU instructions that can only execute on the vector unit:
484 *
485 * opcode ranges:
486 * R6xx/R7xx:
487 *   op3 : [0x08 - 0x0B]
488 *   op2 : 0x07, [0x15 - 0x18], [0x1B - 0x1D], [0x50 - 0x53], [0x7A - 0x7E]
489 *
490 * EVERGREEN:
491 *   op3: [0x04 - 0x11]
492 *   op2: [0xA0 - 0xE2]
493 */
494static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
495{
496	switch (bc->chip_class) {
497	case R600:
498	case R700:
499		if (alu->is_op3)
500			return is_opcode_in_range(alu->inst,
501					V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_64,
502					V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_64_D2);
503		else
504			return (alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FREXP_64) ||
505					is_opcode_in_range(alu->inst,
506						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA,
507						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT) ||
508					is_opcode_in_range(alu->inst,
509						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_64,
510						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT32_TO_FLT64) ||
511					is_opcode_in_range(alu->inst,
512						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4,
513						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4) ||
514					is_opcode_in_range(alu->inst,
515						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LDEXP_64,
516						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_64);
517
518	case EVERGREEN:
519		if (alu->is_op3)
520			return is_opcode_in_range(alu->inst,
521					EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_BFE_UINT,
522					EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_LDS_IDX_OP);
523		else
524			return is_opcode_in_range(alu->inst,
525					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_BFM_INT,
526					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P20);
527	case CAYMAN:
528	default:
529		assert(0);
530		return 0;
531	}
532}
533
534/* ALU instructions that can only execute on the trans unit:
535 *
536 * opcode ranges:
537 * R600:
538 *   op3: 0x0C
539 *   op2: [0x60 - 0x79]
540 *
541 * R700:
542 *   op3: 0x0C
543 *   op2: [0x60 - 0x6F], [0x73 - 0x79]
544 *
545 * EVERGREEN:
546 *   op3: 0x1F
547 *   op2: [0x81 - 0x9C]
548 */
549static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
550{
551
552	switch (bc->chip_class) {
553	case R600:
554		if (alu->is_op3)
555			return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
556		else
557			return is_opcode_in_range(alu->inst,
558					V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT,
559					V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
560	case R700:
561		if (alu->is_op3)
562			return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
563		else
564			return is_opcode_in_range(alu->inst,
565						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT,
566						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS) ||
567					is_opcode_in_range(alu->inst,
568							V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT,
569							V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
570	case EVERGREEN:
571		if (alu->is_op3)
572			return alu->inst == EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
573		else
574			return is_opcode_in_range(alu->inst,
575					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE,
576					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
577	case CAYMAN:
578	default:
579		assert(0);
580		return 0;
581	}
582}
583
/* ALU instructions that can execute on any unit, i.e. those that are
 * neither vector-only nor trans-only. */
static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	return !(is_alu_vec_unit_inst(bc, alu) ||
		 is_alu_trans_unit_inst(bc, alu));
}
590
591static int is_nop_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
592{
593	switch (bc->chip_class) {
594	case R600:
595	case R700:
596		return (!alu->is_op3 && alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
597	case EVERGREEN:
598	case CAYMAN:
599	default:
600		return (!alu->is_op3 && alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
601	}
602}
603
604static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first,
605			    struct r600_bytecode_alu *assignment[5])
606{
607	struct r600_bytecode_alu *alu;
608	unsigned i, chan, trans;
609	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
610
611	for (i = 0; i < max_slots; i++)
612		assignment[i] = NULL;
613
614	for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bytecode_alu, alu->list.next, list)) {
615		chan = alu->dst.chan;
616		if (max_slots == 4)
617			trans = 0;
618		else if (is_alu_trans_unit_inst(bc, alu))
619			trans = 1;
620		else if (is_alu_vec_unit_inst(bc, alu))
621			trans = 0;
622		else if (assignment[chan])
623			trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */
624		else
625			trans = 0;
626
627		if (trans) {
628			if (assignment[4]) {
629				assert(0); /* ALU.Trans has already been allocated. */
630				return -1;
631			}
632			assignment[4] = alu;
633		} else {
634			if (assignment[chan]) {
635				assert(0); /* ALU.chan has already been allocated. */
636				return -1;
637			}
638			assignment[chan] = alu;
639		}
640
641		if (alu->last)
642			break;
643	}
644	return 0;
645}
646
647struct alu_bank_swizzle {
648	int	hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
649	int	hw_cfile_addr[4];
650	int	hw_cfile_elem[4];
651};
652
653static const unsigned cycle_for_bank_swizzle_vec[][3] = {
654	[SQ_ALU_VEC_012] = { 0, 1, 2 },
655	[SQ_ALU_VEC_021] = { 0, 2, 1 },
656	[SQ_ALU_VEC_120] = { 1, 2, 0 },
657	[SQ_ALU_VEC_102] = { 1, 0, 2 },
658	[SQ_ALU_VEC_201] = { 2, 0, 1 },
659	[SQ_ALU_VEC_210] = { 2, 1, 0 }
660};
661
662static const unsigned cycle_for_bank_swizzle_scl[][3] = {
663	[SQ_ALU_SCL_210] = { 2, 1, 0 },
664	[SQ_ALU_SCL_122] = { 1, 2, 2 },
665	[SQ_ALU_SCL_212] = { 2, 1, 2 },
666	[SQ_ALU_SCL_221] = { 2, 2, 1 }
667};
668
669static void init_bank_swizzle(struct alu_bank_swizzle *bs)
670{
671	int i, cycle, component;
672	/* set up gpr use */
673	for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
674		for (component = 0; component < NUM_OF_COMPONENTS; component++)
675			 bs->hw_gpr[cycle][component] = -1;
676	for (i = 0; i < 4; i++)
677		bs->hw_cfile_addr[i] = -1;
678	for (i = 0; i < 4; i++)
679		bs->hw_cfile_elem[i] = -1;
680}
681
682static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
683{
684	if (bs->hw_gpr[cycle][chan] == -1)
685		bs->hw_gpr[cycle][chan] = sel;
686	else if (bs->hw_gpr[cycle][chan] != (int)sel) {
687		/* Another scalar operation has already used the GPR read port for the channel. */
688		return -1;
689	}
690	return 0;
691}
692
693static int reserve_cfile(struct r600_bytecode *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
694{
695	int res, num_res = 4;
696	if (bc->chip_class >= R700) {
697		num_res = 2;
698		chan /= 2;
699	}
700	for (res = 0; res < num_res; ++res) {
701		if (bs->hw_cfile_addr[res] == -1) {
702			bs->hw_cfile_addr[res] = sel;
703			bs->hw_cfile_elem[res] = chan;
704			return 0;
705		} else if (bs->hw_cfile_addr[res] == sel &&
706			bs->hw_cfile_elem[res] == chan)
707			return 0; /* Read for this scalar element already reserved, nothing to do here. */
708	}
709	/* All cfile read ports are used, cannot reference vector element. */
710	return -1;
711}
712
/* Return 1 when the source selector addresses a GPR (selectors 0..127),
 * else 0.  sel is unsigned, so only the upper bound needs checking. */
static int is_gpr(unsigned sel)
{
	return sel <= 127;
}
717
/* CB constants start at 512 and get translated to a kcache index when ALU
 * clauses are constructed.  Note that kcache constants are handled the same
 * way as (the now gone) cfile constants; is that really required?
 *
 * Returns 1 when sel is a constant-file / kcache selector, else 0. */
static int is_cfile(unsigned sel)
{
	/* Kcache after translation. */
	if (sel > 127 && sel < 192)
		return 1;

	/* cfile constants */
	if (sel > 255 && sel < 512)
		return 1;

	/* Kcache before translation. */
	if (sel > 511 && sel < 4607)
		return 1;

	return 0;
}
727
728static int is_const(int sel)
729{
730	return is_cfile(sel) ||
731		(sel >= V_SQ_ALU_SRC_0 &&
732		sel <= V_SQ_ALU_SRC_LITERAL);
733}
734
735static int check_vector(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
736			struct alu_bank_swizzle *bs, int bank_swizzle)
737{
738	int r, src, num_src, sel, elem, cycle;
739
740	num_src = r600_bytecode_get_num_operands(bc, alu);
741	for (src = 0; src < num_src; src++) {
742		sel = alu->src[src].sel;
743		elem = alu->src[src].chan;
744		if (is_gpr(sel)) {
745			cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
746			if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
747				/* Nothing to do; special-case optimization,
748				 * second source uses first source’s reservation. */
749				continue;
750			else {
751				r = reserve_gpr(bs, sel, elem, cycle);
752				if (r)
753					return r;
754			}
755		} else if (is_cfile(sel)) {
756			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
757			if (r)
758				return r;
759		}
760		/* No restrictions on PV, PS, literal or special constants. */
761	}
762	return 0;
763}
764
765static int check_scalar(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
766			struct alu_bank_swizzle *bs, int bank_swizzle)
767{
768	int r, src, num_src, const_count, sel, elem, cycle;
769
770	num_src = r600_bytecode_get_num_operands(bc, alu);
771	for (const_count = 0, src = 0; src < num_src; ++src) {
772		sel = alu->src[src].sel;
773		elem = alu->src[src].chan;
774		if (is_const(sel)) { /* Any constant, including literal and inline constants. */
775			if (const_count >= 2)
776				/* More than two references to a constant in
777				 * transcendental operation. */
778				return -1;
779			else
780				const_count++;
781		}
782		if (is_cfile(sel)) {
783			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
784			if (r)
785				return r;
786		}
787	}
788	for (src = 0; src < num_src; ++src) {
789		sel = alu->src[src].sel;
790		elem = alu->src[src].chan;
791		if (is_gpr(sel)) {
792			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
793			if (cycle < const_count)
794				/* Cycle for GPR load conflicts with
795				 * constant load in transcendental operation. */
796				return -1;
797			r = reserve_gpr(bs, sel, elem, cycle);
798			if (r)
799				return r;
800		}
801		/* PV PS restrictions */
802		if (const_count && (sel == 254 || sel == 255)) {
803			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
804			if (cycle < const_count)
805				return -1;
806		}
807	}
808	return 0;
809}
810
811static int check_and_set_bank_swizzle(struct r600_bytecode *bc,
812				      struct r600_bytecode_alu *slots[5])
813{
814	struct alu_bank_swizzle bs;
815	int bank_swizzle[5];
816	int i, r = 0, forced = 1;
817	boolean scalar_only = bc->chip_class == CAYMAN ? false : true;
818	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
819
820	for (i = 0; i < max_slots; i++) {
821		if (slots[i]) {
822			if (slots[i]->bank_swizzle_force) {
823				slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
824			} else {
825				forced = 0;
826			}
827		}
828
829		if (i < 4 && slots[i])
830			scalar_only = false;
831	}
832	if (forced)
833		return 0;
834
835	/* Just check every possible combination of bank swizzle.
836	 * Not very efficent, but works on the first try in most of the cases. */
837	for (i = 0; i < 4; i++)
838		if (!slots[i] || !slots[i]->bank_swizzle_force)
839			bank_swizzle[i] = SQ_ALU_VEC_012;
840		else
841			bank_swizzle[i] = slots[i]->bank_swizzle;
842
843	bank_swizzle[4] = SQ_ALU_SCL_210;
844	while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
845
846		if (max_slots == 4) {
847			for (i = 0; i < max_slots; i++) {
848				if (bank_swizzle[i] == SQ_ALU_VEC_210)
849				  return -1;
850			}
851		}
852		init_bank_swizzle(&bs);
853		if (scalar_only == false) {
854			for (i = 0; i < 4; i++) {
855				if (slots[i]) {
856					r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
857					if (r)
858						break;
859				}
860			}
861		} else
862			r = 0;
863
864		if (!r && slots[4] && max_slots == 5) {
865			r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
866		}
867		if (!r) {
868			for (i = 0; i < max_slots; i++) {
869				if (slots[i])
870					slots[i]->bank_swizzle = bank_swizzle[i];
871			}
872			return 0;
873		}
874
875		if (scalar_only) {
876			bank_swizzle[4]++;
877		} else {
878			for (i = 0; i < max_slots; i++) {
879				if (!slots[i] || !slots[i]->bank_swizzle_force) {
880					bank_swizzle[i]++;
881					if (bank_swizzle[i] <= SQ_ALU_VEC_210)
882						break;
883					else
884						bank_swizzle[i] = SQ_ALU_VEC_012;
885				}
886			}
887		}
888	}
889
890	/* Couldn't find a working swizzle. */
891	return -1;
892}
893
894static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
895				  struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev)
896{
897	struct r600_bytecode_alu *prev[5];
898	int gpr[5], chan[5];
899	int i, j, r, src, num_src;
900	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
901
902	r = assign_alu_units(bc, alu_prev, prev);
903	if (r)
904		return r;
905
906	for (i = 0; i < max_slots; ++i) {
907		if (prev[i] && (prev[i]->dst.write || prev[i]->is_op3) && !prev[i]->dst.rel) {
908			gpr[i] = prev[i]->dst.sel;
909			/* cube writes more than PV.X */
910			if (!is_alu_cube_inst(bc, prev[i]) && is_alu_reduction_inst(bc, prev[i]))
911				chan[i] = 0;
912			else
913				chan[i] = prev[i]->dst.chan;
914		} else
915			gpr[i] = -1;
916	}
917
918	for (i = 0; i < max_slots; ++i) {
919		struct r600_bytecode_alu *alu = slots[i];
920		if(!alu)
921			continue;
922
923		num_src = r600_bytecode_get_num_operands(bc, alu);
924		for (src = 0; src < num_src; ++src) {
925			if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
926				continue;
927
928			if (bc->chip_class < CAYMAN) {
929				if (alu->src[src].sel == gpr[4] &&
930				    alu->src[src].chan == chan[4]) {
931					alu->src[src].sel = V_SQ_ALU_SRC_PS;
932					alu->src[src].chan = 0;
933					continue;
934				}
935			}
936
937			for (j = 0; j < 4; ++j) {
938				if (alu->src[src].sel == gpr[j] &&
939					alu->src[src].chan == j) {
940					alu->src[src].sel = V_SQ_ALU_SRC_PV;
941					alu->src[src].chan = chan[j];
942					break;
943				}
944			}
945		}
946	}
947
948	return 0;
949}
950
951void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *neg)
952{
953	switch(value) {
954	case 0:
955		*sel = V_SQ_ALU_SRC_0;
956		break;
957	case 1:
958		*sel = V_SQ_ALU_SRC_1_INT;
959		break;
960	case -1:
961		*sel = V_SQ_ALU_SRC_M_1_INT;
962		break;
963	case 0x3F800000: /* 1.0f */
964		*sel = V_SQ_ALU_SRC_1;
965		break;
966	case 0x3F000000: /* 0.5f */
967		*sel = V_SQ_ALU_SRC_0_5;
968		break;
969	case 0xBF800000: /* -1.0f */
970		*sel = V_SQ_ALU_SRC_1;
971		*neg ^= 1;
972		break;
973	case 0xBF000000: /* -0.5f */
974		*sel = V_SQ_ALU_SRC_0_5;
975		*neg ^= 1;
976		break;
977	default:
978		*sel = V_SQ_ALU_SRC_LITERAL;
979		break;
980	}
981}
982
983/* compute how many literal are needed */
984static int r600_bytecode_alu_nliterals(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
985				 uint32_t literal[4], unsigned *nliteral)
986{
987	unsigned num_src = r600_bytecode_get_num_operands(bc, alu);
988	unsigned i, j;
989
990	for (i = 0; i < num_src; ++i) {
991		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
992			uint32_t value = alu->src[i].value;
993			unsigned found = 0;
994			for (j = 0; j < *nliteral; ++j) {
995				if (literal[j] == value) {
996					found = 1;
997					break;
998				}
999			}
1000			if (!found) {
1001				if (*nliteral >= 4)
1002					return -EINVAL;
1003				literal[(*nliteral)++] = value;
1004			}
1005		}
1006	}
1007	return 0;
1008}
1009
1010static void r600_bytecode_alu_adjust_literals(struct r600_bytecode *bc,
1011					struct r600_bytecode_alu *alu,
1012					uint32_t literal[4], unsigned nliteral)
1013{
1014	unsigned num_src = r600_bytecode_get_num_operands(bc, alu);
1015	unsigned i, j;
1016
1017	for (i = 0; i < num_src; ++i) {
1018		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1019			uint32_t value = alu->src[i].value;
1020			for (j = 0; j < nliteral; ++j) {
1021				if (literal[j] == value) {
1022					alu->src[i].chan = j;
1023					break;
1024				}
1025			}
1026		}
1027	}
1028}
1029
1030static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5],
1031			     struct r600_bytecode_alu *alu_prev)
1032{
1033	struct r600_bytecode_alu *prev[5];
1034	struct r600_bytecode_alu *result[5] = { NULL };
1035
1036	uint32_t literal[4], prev_literal[4];
1037	unsigned nliteral = 0, prev_nliteral = 0;
1038
1039	int i, j, r, src, num_src;
1040	int num_once_inst = 0;
1041	int have_mova = 0, have_rel = 0;
1042	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
1043
1044	r = assign_alu_units(bc, alu_prev, prev);
1045	if (r)
1046		return r;
1047
1048	for (i = 0; i < max_slots; ++i) {
1049		struct r600_bytecode_alu *alu;
1050
1051		/* check number of literals */
1052		if (prev[i]) {
1053			if (r600_bytecode_alu_nliterals(bc, prev[i], literal, &nliteral))
1054				return 0;
1055			if (r600_bytecode_alu_nliterals(bc, prev[i], prev_literal, &prev_nliteral))
1056				return 0;
1057			if (is_alu_mova_inst(bc, prev[i])) {
1058				if (have_rel)
1059					return 0;
1060				have_mova = 1;
1061			}
1062			num_once_inst += is_alu_once_inst(bc, prev[i]);
1063		}
1064		if (slots[i] && r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral))
1065			return 0;
1066
1067		/* Let's check used slots. */
1068		if (prev[i] && !slots[i]) {
1069			result[i] = prev[i];
1070			continue;
1071		} else if (prev[i] && slots[i]) {
1072			if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
1073				/* Trans unit is still free try to use it. */
1074				if (is_alu_any_unit_inst(bc, slots[i])) {
1075					result[i] = prev[i];
1076					result[4] = slots[i];
1077				} else if (is_alu_any_unit_inst(bc, prev[i])) {
1078					if (slots[i]->dst.sel == prev[i]->dst.sel &&
1079						(slots[i]->dst.write == 1 || slots[i]->is_op3) &&
1080						(prev[i]->dst.write == 1 || prev[i]->is_op3))
1081						return 0;
1082
1083					result[i] = slots[i];
1084					result[4] = prev[i];
1085				} else
1086					return 0;
1087			} else
1088				return 0;
1089		} else if(!slots[i]) {
1090			continue;
1091		} else {
1092			if (max_slots == 5 && slots[i] && prev[4] &&
1093					slots[i]->dst.sel == prev[4]->dst.sel &&
1094					slots[i]->dst.chan == prev[4]->dst.chan &&
1095					(slots[i]->dst.write == 1 || slots[i]->is_op3) &&
1096					(prev[4]->dst.write == 1 || prev[4]->is_op3))
1097				return 0;
1098
1099			result[i] = slots[i];
1100		}
1101
1102		alu = slots[i];
1103		num_once_inst += is_alu_once_inst(bc, alu);
1104
1105		/* don't reschedule NOPs */
1106		if (is_nop_inst(bc, alu))
1107			return 0;
1108
1109		/* Let's check dst gpr. */
1110		if (alu->dst.rel) {
1111			if (have_mova)
1112				return 0;
1113			have_rel = 1;
1114		}
1115
1116		/* Let's check source gprs */
1117		num_src = r600_bytecode_get_num_operands(bc, alu);
1118		for (src = 0; src < num_src; ++src) {
1119			if (alu->src[src].rel) {
1120				if (have_mova)
1121					return 0;
1122				have_rel = 1;
1123			}
1124
1125			/* Constants don't matter. */
1126			if (!is_gpr(alu->src[src].sel))
1127				continue;
1128
1129			for (j = 0; j < max_slots; ++j) {
1130				if (!prev[j] || !(prev[j]->dst.write || prev[j]->is_op3))
1131					continue;
1132
1133				/* If it's relative then we can't determin which gpr is really used. */
1134				if (prev[j]->dst.chan == alu->src[src].chan &&
1135					(prev[j]->dst.sel == alu->src[src].sel ||
1136					prev[j]->dst.rel || alu->src[src].rel))
1137					return 0;
1138			}
1139		}
1140	}
1141
1142	/* more than one PRED_ or KILL_ ? */
1143	if (num_once_inst > 1)
1144		return 0;
1145
1146	/* check if the result can still be swizzlet */
1147	r = check_and_set_bank_swizzle(bc, result);
1148	if (r)
1149		return 0;
1150
1151	/* looks like everything worked out right, apply the changes */
1152
1153	/* undo adding previus literals */
1154	bc->cf_last->ndw -= align(prev_nliteral, 2);
1155
1156	/* sort instructions */
1157	for (i = 0; i < max_slots; ++i) {
1158		slots[i] = result[i];
1159		if (result[i]) {
1160			LIST_DEL(&result[i]->list);
1161			result[i]->last = 0;
1162			LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
1163		}
1164	}
1165
1166	/* determine new last instruction */
1167	LIST_ENTRY(struct r600_bytecode_alu, bc->cf_last->alu.prev, list)->last = 1;
1168
1169	/* determine new first instruction */
1170	for (i = 0; i < max_slots; ++i) {
1171		if (result[i]) {
1172			bc->cf_last->curr_bs_head = result[i];
1173			break;
1174		}
1175	}
1176
1177	bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
1178	bc->cf_last->prev2_bs_head = NULL;
1179
1180	return 0;
1181}
1182
1183/* we'll keep kcache sets sorted by bank & addr */
1184static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc,
1185		struct r600_bytecode_kcache *kcache,
1186		unsigned bank, unsigned line)
1187{
1188	int i, kcache_banks = bc->chip_class >= EVERGREEN ? 4 : 2;
1189
1190	for (i = 0; i < kcache_banks; i++) {
1191		if (kcache[i].mode) {
1192			int d;
1193
1194			if (kcache[i].bank < bank)
1195				continue;
1196
1197			if ((kcache[i].bank == bank && kcache[i].addr > line+1) ||
1198					kcache[i].bank > bank) {
1199				/* try to insert new line */
1200				if (kcache[kcache_banks-1].mode) {
1201					/* all sets are in use */
1202					return -ENOMEM;
1203				}
1204
1205				memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache));
1206				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
1207				kcache[i].bank = bank;
1208				kcache[i].addr = line;
1209				return 0;
1210			}
1211
1212			d = line - kcache[i].addr;
1213
1214			if (d == -1) {
1215				kcache[i].addr--;
1216				if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) {
1217					/* we are prepending the line to the current set,
1218					 * discarding the existing second line,
1219					 * so we'll have to insert line+2 after it */
1220					line += 2;
1221					continue;
1222				} else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) {
1223					kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
1224					return 0;
1225				} else {
1226					/* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
1227					return -ENOMEM;
1228				}
1229			} else if (d == 1) {
1230				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
1231				return 0;
1232			} else if (d == 0)
1233				return 0;
1234		} else { /* free kcache set - use it */
1235			kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
1236			kcache[i].bank = bank;
1237			kcache[i].addr = line;
1238			return 0;
1239		}
1240	}
1241	return -ENOMEM;
1242}
1243
1244static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc,
1245		struct r600_bytecode_kcache *kcache,
1246		struct r600_bytecode_alu *alu)
1247{
1248	int i, r;
1249
1250	for (i = 0; i < 3; i++) {
1251		unsigned bank, line, sel = alu->src[i].sel;
1252
1253		if (sel < 512)
1254			continue;
1255
1256		bank = alu->src[i].kc_bank;
1257		line = (sel-512)>>4;
1258
1259		if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line)))
1260			return r;
1261	}
1262	return 0;
1263}
1264
1265static int r600_bytecode_assign_kcache_banks(struct r600_bytecode *bc,
1266		struct r600_bytecode_alu *alu,
1267		struct r600_bytecode_kcache * kcache)
1268{
1269	int i, j;
1270
1271	/* Alter the src operands to refer to the kcache. */
1272	for (i = 0; i < 3; ++i) {
1273		static const unsigned int base[] = {128, 160, 256, 288};
1274		unsigned int line, sel = alu->src[i].sel, found = 0;
1275
1276		if (sel < 512)
1277			continue;
1278
1279		sel -= 512;
1280		line = sel>>4;
1281
1282		for (j = 0; j < 4 && !found; ++j) {
1283			switch (kcache[j].mode) {
1284			case V_SQ_CF_KCACHE_NOP:
1285			case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX:
1286				R600_ERR("unexpected kcache line mode\n");
1287				return -ENOMEM;
1288			default:
1289				if (kcache[j].bank == alu->src[i].kc_bank &&
1290						kcache[j].addr <= line &&
1291						line < kcache[j].addr + kcache[j].mode) {
1292					alu->src[i].sel = sel - (kcache[j].addr<<4);
1293					alu->src[i].sel += base[j];
1294					found=1;
1295			    }
1296			}
1297		}
1298	}
1299	return 0;
1300}
1301
1302static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
1303{
1304	struct r600_bytecode_kcache kcache_sets[4];
1305	struct r600_bytecode_kcache *kcache = kcache_sets;
1306	int r;
1307
1308	memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache));
1309
1310	if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
1311		/* can't alloc, need to start new clause */
1312		if ((r = r600_bytecode_add_cf(bc))) {
1313			return r;
1314		}
1315		bc->cf_last->inst = type;
1316
1317		/* retry with the new clause */
1318		kcache = bc->cf_last->kcache;
1319		if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
1320			/* can't alloc again- should never happen */
1321			return r;
1322		}
1323	} else {
1324		/* update kcache sets */
1325		memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache));
1326	}
1327
1328	/* if we actually used more than 2 kcache sets - use ALU_EXTENDED on eg+ */
1329	if (kcache[2].mode != V_SQ_CF_KCACHE_NOP) {
1330		if (bc->chip_class < EVERGREEN)
1331			return -ENOMEM;
1332		bc->cf_last->eg_alu_extended = 1;
1333	}
1334
1335	return 0;
1336}
1337
1338static int insert_nop_r6xx(struct r600_bytecode *bc)
1339{
1340	struct r600_bytecode_alu alu;
1341	int r, i;
1342
1343	for (i = 0; i < 4; i++) {
1344		memset(&alu, 0, sizeof(alu));
1345		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
1346		alu.src[0].chan = i;
1347		alu.dst.chan = i;
1348		alu.last = (i == 3);
1349		r = r600_bytecode_add_alu(bc, &alu);
1350		if (r)
1351			return r;
1352	}
1353	return 0;
1354}
1355
1356/* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1357static int load_ar_r6xx(struct r600_bytecode *bc)
1358{
1359	struct r600_bytecode_alu alu;
1360	int r;
1361
1362	if (bc->ar_loaded)
1363		return 0;
1364
1365	/* hack to avoid making MOVA the last instruction in the clause */
1366	if ((bc->cf_last->ndw>>1) >= 110)
1367		bc->force_add_cf = 1;
1368
1369	memset(&alu, 0, sizeof(alu));
1370	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT;
1371	alu.src[0].sel = bc->ar_reg;
1372	alu.last = 1;
1373	alu.index_mode = INDEX_MODE_LOOP;
1374	r = r600_bytecode_add_alu(bc, &alu);
1375	if (r)
1376		return r;
1377
1378	/* no requirement to set uses waterfall on MOVA_GPR_INT */
1379	bc->ar_loaded = 1;
1380	return 0;
1381}
1382
1383/* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1384static int load_ar(struct r600_bytecode *bc)
1385{
1386	struct r600_bytecode_alu alu;
1387	int r;
1388
1389	if (bc->ar_handling)
1390		return load_ar_r6xx(bc);
1391
1392	if (bc->ar_loaded)
1393		return 0;
1394
1395	/* hack to avoid making MOVA the last instruction in the clause */
1396	if ((bc->cf_last->ndw>>1) >= 110)
1397		bc->force_add_cf = 1;
1398
1399	memset(&alu, 0, sizeof(alu));
1400	alu.inst = BC_INST(bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
1401	alu.src[0].sel = bc->ar_reg;
1402	alu.last = 1;
1403	r = r600_bytecode_add_alu(bc, &alu);
1404	if (r)
1405		return r;
1406
1407	bc->cf_last->r6xx_uses_waterfall = 1;
1408	bc->ar_loaded = 1;
1409	return 0;
1410}
1411
1412int r600_bytecode_add_alu_type(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu, int type)
1413{
1414	struct r600_bytecode_alu *nalu = r600_bytecode_alu();
1415	struct r600_bytecode_alu *lalu;
1416	int i, r;
1417
1418	if (nalu == NULL)
1419		return -ENOMEM;
1420	memcpy(nalu, alu, sizeof(struct r600_bytecode_alu));
1421
1422	if (bc->cf_last != NULL && bc->cf_last->inst != type) {
1423		/* check if we could add it anyway */
1424		if (bc->cf_last->inst == BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU) &&
1425			type == BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE)) {
1426			LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
1427				if (lalu->execute_mask) {
1428					bc->force_add_cf = 1;
1429					break;
1430				}
1431			}
1432		} else
1433			bc->force_add_cf = 1;
1434	}
1435
1436	/* cf can contains only alu or only vtx or only tex */
1437	if (bc->cf_last == NULL || bc->force_add_cf) {
1438		r = r600_bytecode_add_cf(bc);
1439		if (r) {
1440			free(nalu);
1441			return r;
1442		}
1443	}
1444	bc->cf_last->inst = type;
1445
1446	/* Check AR usage and load it if required */
1447	for (i = 0; i < 3; i++)
1448		if (nalu->src[i].rel && !bc->ar_loaded)
1449			load_ar(bc);
1450
1451	if (nalu->dst.rel && !bc->ar_loaded)
1452		load_ar(bc);
1453
1454	/* Setup the kcache for this ALU instruction. This will start a new
1455	 * ALU clause if needed. */
1456	if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) {
1457		free(nalu);
1458		return r;
1459	}
1460
1461	if (!bc->cf_last->curr_bs_head) {
1462		bc->cf_last->curr_bs_head = nalu;
1463	}
1464	/* number of gpr == the last gpr used in any alu */
1465	for (i = 0; i < 3; i++) {
1466		if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
1467			bc->ngpr = nalu->src[i].sel + 1;
1468		}
1469		if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
1470			r600_bytecode_special_constants(nalu->src[i].value,
1471				&nalu->src[i].sel, &nalu->src[i].neg);
1472	}
1473	if (nalu->dst.sel >= bc->ngpr) {
1474		bc->ngpr = nalu->dst.sel + 1;
1475	}
1476	LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
1477	/* each alu use 2 dwords */
1478	bc->cf_last->ndw += 2;
1479	bc->ndw += 2;
1480
1481	/* process cur ALU instructions for bank swizzle */
1482	if (nalu->last) {
1483		uint32_t literal[4];
1484		unsigned nliteral;
1485		struct r600_bytecode_alu *slots[5];
1486		int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
1487		r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
1488		if (r)
1489			return r;
1490
1491		if (bc->cf_last->prev_bs_head) {
1492			r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
1493			if (r)
1494				return r;
1495		}
1496
1497		if (bc->cf_last->prev_bs_head) {
1498			r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
1499			if (r)
1500				return r;
1501		}
1502
1503		r = check_and_set_bank_swizzle(bc, slots);
1504		if (r)
1505			return r;
1506
1507		for (i = 0, nliteral = 0; i < max_slots; i++) {
1508			if (slots[i]) {
1509				r = r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral);
1510				if (r)
1511					return r;
1512			}
1513		}
1514		bc->cf_last->ndw += align(nliteral, 2);
1515
1516		/* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
1517		 * worst case */
1518		if ((bc->cf_last->ndw >> 1) >= 120) {
1519			bc->force_add_cf = 1;
1520		}
1521
1522		bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
1523		bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
1524		bc->cf_last->curr_bs_head = NULL;
1525	}
1526
1527	if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst)
1528		insert_nop_r6xx(bc);
1529
1530	return 0;
1531}
1532
1533int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu)
1534{
1535	return r600_bytecode_add_alu_type(bc, alu, BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
1536}
1537
1538static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc)
1539{
1540	switch (bc->chip_class) {
1541	case R600:
1542		return 8;
1543
1544	case R700:
1545	case EVERGREEN:
1546	case CAYMAN:
1547		return 16;
1548
1549	default:
1550		R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1551		return 8;
1552	}
1553}
1554
1555static inline boolean last_inst_was_not_vtx_fetch(struct r600_bytecode *bc)
1556{
1557	switch (bc->chip_class) {
1558	case R700:
1559	case R600:
1560		return bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
1561		       bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC;
1562	case EVERGREEN:
1563		return bc->cf_last->inst != EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX;
1564	case CAYMAN:
1565		return bc->cf_last->inst != CM_V_SQ_CF_WORD1_SQ_CF_INST_TC;
1566	default:
1567		R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1568		return FALSE;
1569	}
1570}
1571
1572int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
1573{
1574	struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx();
1575	int r;
1576
1577	if (nvtx == NULL)
1578		return -ENOMEM;
1579	memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx));
1580
1581	/* cf can contains only alu or only vtx or only tex */
1582	if (bc->cf_last == NULL ||
1583	    last_inst_was_not_vtx_fetch(bc) ||
1584	    bc->force_add_cf) {
1585		r = r600_bytecode_add_cf(bc);
1586		if (r) {
1587			free(nvtx);
1588			return r;
1589		}
1590		switch (bc->chip_class) {
1591		case R600:
1592		case R700:
1593			bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
1594			break;
1595		case EVERGREEN:
1596			bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX;
1597			break;
1598		case CAYMAN:
1599			bc->cf_last->inst = CM_V_SQ_CF_WORD1_SQ_CF_INST_TC;
1600			break;
1601		default:
1602			R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1603			return -EINVAL;
1604		}
1605	}
1606	LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
1607	/* each fetch use 4 dwords */
1608	bc->cf_last->ndw += 4;
1609	bc->ndw += 4;
1610	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1611		bc->force_add_cf = 1;
1612
1613	bc->ngpr = MAX2(bc->ngpr, vtx->src_gpr + 1);
1614	bc->ngpr = MAX2(bc->ngpr, vtx->dst_gpr + 1);
1615
1616	return 0;
1617}
1618
1619int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex)
1620{
1621	struct r600_bytecode_tex *ntex = r600_bytecode_tex();
1622	int r;
1623
1624	if (ntex == NULL)
1625		return -ENOMEM;
1626	memcpy(ntex, tex, sizeof(struct r600_bytecode_tex));
1627
1628	/* we can't fetch data und use it as texture lookup address in the same TEX clause */
1629	if (bc->cf_last != NULL &&
1630		bc->cf_last->inst == BC_INST(bc, V_SQ_CF_WORD1_SQ_CF_INST_TEX)) {
1631		struct r600_bytecode_tex *ttex;
1632		LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
1633			if (ttex->dst_gpr == ntex->src_gpr) {
1634				bc->force_add_cf = 1;
1635				break;
1636			}
1637		}
1638		/* slight hack to make gradients always go into same cf */
1639		if (ntex->inst == SQ_TEX_INST_SET_GRADIENTS_H)
1640			bc->force_add_cf = 1;
1641	}
1642
1643	/* cf can contains only alu or only vtx or only tex */
1644	if (bc->cf_last == NULL ||
1645		bc->cf_last->inst != BC_INST(bc, V_SQ_CF_WORD1_SQ_CF_INST_TEX) ||
1646	        bc->force_add_cf) {
1647		r = r600_bytecode_add_cf(bc);
1648		if (r) {
1649			free(ntex);
1650			return r;
1651		}
1652		bc->cf_last->inst = BC_INST(bc, V_SQ_CF_WORD1_SQ_CF_INST_TEX);
1653	}
1654	if (ntex->src_gpr >= bc->ngpr) {
1655		bc->ngpr = ntex->src_gpr + 1;
1656	}
1657	if (ntex->dst_gpr >= bc->ngpr) {
1658		bc->ngpr = ntex->dst_gpr + 1;
1659	}
1660	LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
1661	/* each texture fetch use 4 dwords */
1662	bc->cf_last->ndw += 4;
1663	bc->ndw += 4;
1664	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1665		bc->force_add_cf = 1;
1666	return 0;
1667}
1668
1669int r600_bytecode_add_cfinst(struct r600_bytecode *bc, int inst)
1670{
1671	int r;
1672	r = r600_bytecode_add_cf(bc);
1673	if (r)
1674		return r;
1675
1676	bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
1677	bc->cf_last->inst = inst;
1678	return 0;
1679}
1680
1681int cm_bytecode_add_cf_end(struct r600_bytecode *bc)
1682{
1683	return r600_bytecode_add_cfinst(bc, CM_V_SQ_CF_WORD1_SQ_CF_INST_END);
1684}
1685
1686/* common to all 3 families */
1687static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id)
1688{
1689	bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
1690			S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
1691			S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
1692			S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
1693	if (bc->chip_class < CAYMAN)
1694		bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
1695	id++;
1696	bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
1697				S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
1698				S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
1699				S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
1700				S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
1701				S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
1702				S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
1703				S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
1704				S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
1705				S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
1706	bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
1707				S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
1708	if (bc->chip_class < CAYMAN)
1709		bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
1710	id++;
1711	bc->bytecode[id++] = 0;
1712	return 0;
1713}
1714
1715/* common to all 3 families */
1716static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id)
1717{
1718	bc->bytecode[id++] = S_SQ_TEX_WORD0_TEX_INST(tex->inst) |
1719				S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
1720				S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
1721				S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
1722	bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
1723				S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
1724				S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
1725				S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
1726				S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
1727				S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
1728				S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
1729				S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
1730				S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
1731				S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
1732				S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
1733	bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
1734				S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
1735				S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
1736				S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
1737				S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
1738				S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
1739				S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
1740				S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
1741	bc->bytecode[id++] = 0;
1742	return 0;
1743}
1744
1745/* r600 only, r700/eg bits in r700_asm.c */
1746static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id)
1747{
1748	/* don't replace gpr by pv or ps for destination register */
1749	bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
1750				S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
1751				S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
1752				S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
1753				S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
1754				S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
1755				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
1756				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
1757				S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
1758				S_SQ_ALU_WORD0_PRED_SEL(alu->pred_sel) |
1759				S_SQ_ALU_WORD0_LAST(alu->last);
1760
1761	if (alu->is_op3) {
1762		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1763					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1764					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1765					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1766					S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
1767					S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
1768					S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
1769					S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
1770					S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) |
1771					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
1772	} else {
1773		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1774					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1775					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1776					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1777					S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
1778					S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
1779					S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
1780					S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
1781					S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) |
1782					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
1783					S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->execute_mask) |
1784					S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->update_pred);
1785	}
1786	return 0;
1787}
1788
1789static void r600_bytecode_cf_vtx_build(uint32_t *bytecode, const struct r600_bytecode_cf *cf)
1790{
1791	*bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
1792	*bytecode++ = cf->inst |
1793			S_SQ_CF_WORD1_BARRIER(1) |
1794			S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
1795}
1796
1797/* common for r600/r700 - eg in eg_asm.c */
1798static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
1799{
1800	unsigned id = cf->id;
1801
1802	switch (cf->inst) {
1803	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
1804	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
1805	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
1806	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
1807		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
1808			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
1809			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
1810			S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
1811
1812		bc->bytecode[id++] = cf->inst |
1813			S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
1814			S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
1815			S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
1816					S_SQ_CF_ALU_WORD1_BARRIER(1) |
1817					S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chip_class == R600 ? cf->r6xx_uses_waterfall : 0) |
1818					S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
1819		break;
1820	case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1821	case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1822	case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1823		if (bc->chip_class == R700)
1824			r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
1825		else
1826			r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
1827		break;
1828	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1829	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1830		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1831			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1832			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1833			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
1834		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1835			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
1836			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
1837			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
1838			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
1839			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
1840			cf->output.inst |
1841			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
1842		break;
1843	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
1844	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
1845	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
1846	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
1847		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1848			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1849			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1850			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
1851		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1852			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
1853			cf->output.inst |
1854			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program) |
1855			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) |
1856			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask);
1857		break;
1858	case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1859	case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1860	case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1861	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1862	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1863	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1864	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1865	case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1866	case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1867		bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
1868		bc->bytecode[id++] = cf->inst |
1869					S_SQ_CF_WORD1_BARRIER(1) |
1870			                S_SQ_CF_WORD1_COND(cf->cond) |
1871			                S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);
1872
1873		break;
1874	default:
1875		R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1876		return -EINVAL;
1877	}
1878	return 0;
1879}
1880
1881int r600_bytecode_build(struct r600_bytecode *bc)
1882{
1883	struct r600_bytecode_cf *cf;
1884	struct r600_bytecode_alu *alu;
1885	struct r600_bytecode_vtx *vtx;
1886	struct r600_bytecode_tex *tex;
1887	uint32_t literal[4];
1888	unsigned nliteral;
1889	unsigned addr;
1890	int i, r;
1891
1892	if (bc->callstack[0].max > 0)
1893		bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
1894	if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
1895		bc->nstack = 1;
1896	}
1897
1898	/* first path compute addr of each CF block */
1899	/* addr start after all the CF instructions */
1900	addr = bc->cf_last->id + 2;
1901	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1902		if (bc->chip_class >= EVERGREEN) {
1903			switch (cf->inst) {
1904			case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1905			case EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1906				/* fetch node need to be 16 bytes aligned*/
1907				addr += 3;
1908				addr &= 0xFFFFFFFCUL;
1909				break;
1910			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
1911			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
1912			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
1913			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
1914			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1915			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1916			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
1917			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
1918			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
1919			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
1920			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
1921			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
1922			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
1923			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
1924			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
1925			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
1926			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
1927			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
1928			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
1929			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
1930			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
1931			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
1932			case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1933			case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1934			case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
1935			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1936			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1937			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1938			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1939			case EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1940			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1941			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
1942			case CF_NATIVE:
1943				break;
1944			default:
1945				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1946				return -EINVAL;
1947			}
1948		} else {
1949			switch (cf->inst) {
1950			case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1951			case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1952			case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1953				/* fetch node need to be 16 bytes aligned*/
1954				addr += 3;
1955				addr &= 0xFFFFFFFCUL;
1956				break;
1957			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
1958			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
1959			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
1960			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
1961			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1962			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1963			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
1964			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
1965			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
1966			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
1967			case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1968			case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1969			case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1970			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1971			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1972			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1973			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1974			case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1975			case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1976				break;
1977			default:
1978				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1979				return -EINVAL;
1980			}
1981		}
1982		cf->addr = addr;
1983		addr += cf->ndw;
1984		bc->ndw = cf->addr + cf->ndw;
1985	}
1986	free(bc->bytecode);
1987	bc->bytecode = calloc(1, bc->ndw * 4);
1988	if (bc->bytecode == NULL)
1989		return -ENOMEM;
1990	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1991		addr = cf->addr;
1992		if (bc->chip_class >= EVERGREEN) {
1993			r = eg_bytecode_cf_build(bc, cf);
1994			if (r)
1995				return r;
1996
1997			switch (cf->inst) {
1998			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
1999			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
2000			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
2001			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
2002				nliteral = 0;
2003				memset(literal, 0, sizeof(literal));
2004				LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2005					r = r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
2006					if (r)
2007						return r;
2008					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
2009					r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
2010
2011					switch(bc->chip_class) {
2012					case EVERGREEN: /* eg alu is same encoding as r700 */
2013					case CAYMAN:
2014						r = r700_bytecode_alu_build(bc, alu, addr);
2015						break;
2016					default:
2017						R600_ERR("unknown chip class %d.\n", bc->chip_class);
2018						return -EINVAL;
2019					}
2020					if (r)
2021						return r;
2022					addr += 2;
2023					if (alu->last) {
2024						for (i = 0; i < align(nliteral, 2); ++i) {
2025							bc->bytecode[addr++] = literal[i];
2026						}
2027						nliteral = 0;
2028						memset(literal, 0, sizeof(literal));
2029					}
2030				}
2031				break;
2032			case EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX:
2033				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2034					r = r600_bytecode_vtx_build(bc, vtx, addr);
2035					if (r)
2036						return r;
2037					addr += 4;
2038				}
2039				break;
2040			case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
2041				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2042					assert(bc->chip_class >= EVERGREEN);
2043					r = r600_bytecode_vtx_build(bc, vtx, addr);
2044					if (r)
2045						return r;
2046					addr += 4;
2047				}
2048				LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2049					r = r600_bytecode_tex_build(bc, tex, addr);
2050					if (r)
2051						return r;
2052					addr += 4;
2053				}
2054				break;
2055			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
2056			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
2057			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
2058			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
2059			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
2060			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
2061			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
2062			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
2063			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
2064			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
2065			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
2066			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
2067			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
2068			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
2069			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
2070			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
2071			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
2072			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
2073			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
2074			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
2075			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
2076			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
2077			case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
2078			case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
2079			case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
2080			case EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
2081			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
2082			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
2083				break;
2084			case CF_NATIVE:
2085				break;
2086			default:
2087				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
2088				return -EINVAL;
2089			}
2090		} else {
2091			r = r600_bytecode_cf_build(bc, cf);
2092			if (r)
2093				return r;
2094
2095			switch (cf->inst) {
2096			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
2097			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
2098			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
2099			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
2100				nliteral = 0;
2101				memset(literal, 0, sizeof(literal));
2102				LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2103					r = r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
2104					if (r)
2105						return r;
2106					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
2107					r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
2108
2109					switch(bc->chip_class) {
2110					case R600:
2111						r = r600_bytecode_alu_build(bc, alu, addr);
2112						break;
2113					case R700:
2114						r = r700_bytecode_alu_build(bc, alu, addr);
2115						break;
2116					default:
2117						R600_ERR("unknown chip class %d.\n", bc->chip_class);
2118						return -EINVAL;
2119					}
2120					if (r)
2121						return r;
2122					addr += 2;
2123					if (alu->last) {
2124						for (i = 0; i < align(nliteral, 2); ++i) {
2125							bc->bytecode[addr++] = literal[i];
2126						}
2127						nliteral = 0;
2128						memset(literal, 0, sizeof(literal));
2129					}
2130				}
2131				break;
2132			case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
2133			case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
2134				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2135					r = r600_bytecode_vtx_build(bc, vtx, addr);
2136					if (r)
2137						return r;
2138					addr += 4;
2139				}
2140				break;
2141			case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
2142				LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2143					r = r600_bytecode_tex_build(bc, tex, addr);
2144					if (r)
2145						return r;
2146					addr += 4;
2147				}
2148				break;
2149			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
2150			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
2151			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
2152			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
2153			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
2154			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
2155			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
2156			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
2157			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
2158			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
2159			case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
2160			case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
2161			case V_SQ_CF_WORD1_SQ_CF_INST_POP:
2162			case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
2163			case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
2164				break;
2165			default:
2166				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
2167				return -EINVAL;
2168			}
2169		}
2170	}
2171	return 0;
2172}
2173
2174void r600_bytecode_clear(struct r600_bytecode *bc)
2175{
2176	struct r600_bytecode_cf *cf = NULL, *next_cf;
2177
2178	free(bc->bytecode);
2179	bc->bytecode = NULL;
2180
2181	LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
2182		struct r600_bytecode_alu *alu = NULL, *next_alu;
2183		struct r600_bytecode_tex *tex = NULL, *next_tex;
2184		struct r600_bytecode_tex *vtx = NULL, *next_vtx;
2185
2186		LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
2187			free(alu);
2188		}
2189
2190		LIST_INITHEAD(&cf->alu);
2191
2192		LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
2193			free(tex);
2194		}
2195
2196		LIST_INITHEAD(&cf->tex);
2197
2198		LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
2199			free(vtx);
2200		}
2201
2202		LIST_INITHEAD(&cf->vtx);
2203
2204		free(cf);
2205	}
2206
2207	LIST_INITHEAD(&cf->list);
2208}
2209
2210void r600_bytecode_dump(struct r600_bytecode *bc)
2211{
2212	struct r600_bytecode_cf *cf = NULL;
2213	struct r600_bytecode_alu *alu = NULL;
2214	struct r600_bytecode_vtx *vtx = NULL;
2215	struct r600_bytecode_tex *tex = NULL;
2216
2217	unsigned i, id;
2218	uint32_t literal[4];
2219	unsigned nliteral;
2220	char chip = '6';
2221
2222	switch (bc->chip_class) {
2223	case R700:
2224		chip = '7';
2225		break;
2226	case EVERGREEN:
2227		chip = 'E';
2228		break;
2229	case CAYMAN:
2230		chip = 'C';
2231		break;
2232	case R600:
2233	default:
2234		chip = '6';
2235		break;
2236	}
2237	fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
2238	fprintf(stderr, "     %c\n", chip);
2239
2240	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2241		id = cf->id;
2242
2243		if (bc->chip_class >= EVERGREEN) {
2244			switch (cf->inst) {
2245			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
2246			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
2247			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
2248			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
2249				if (cf->eg_alu_extended) {
2250					fprintf(stderr, "%04d %08X ALU_EXT0 ", id, bc->bytecode[id]);
2251					fprintf(stderr, "KCACHE_BANK2:%X ", cf->kcache[2].bank);
2252					fprintf(stderr, "KCACHE_BANK3:%X ", cf->kcache[3].bank);
2253					fprintf(stderr, "KCACHE_MODE2:%X\n", cf->kcache[2].mode);
2254					id++;
2255					fprintf(stderr, "%04d %08X ALU_EXT1 ", id, bc->bytecode[id]);
2256					fprintf(stderr, "KCACHE_MODE3:%X ", cf->kcache[3].mode);
2257					fprintf(stderr, "KCACHE_ADDR2:%X ", cf->kcache[2].addr);
2258					fprintf(stderr, "KCACHE_ADDR3:%X\n", cf->kcache[3].addr);
2259					id++;
2260				}
2261
2262				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2263				fprintf(stderr, "ADDR:%d ", cf->addr);
2264				fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
2265				fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
2266				fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
2267				id++;
2268				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2269				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_ALU_WORD1_CF_INST(cf->inst));
2270				fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
2271				fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
2272				fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
2273				fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
2274				break;
2275			case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
2276			case EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX:
2277				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2278				fprintf(stderr, "ADDR:%d\n", cf->addr);
2279				id++;
2280				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2281				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_WORD1_CF_INST(cf->inst));
2282				fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
2283				break;
2284			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
2285			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
2286				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2287				fprintf(stderr, "GPR:%X ", cf->output.gpr);
2288				fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
2289				fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
2290				fprintf(stderr, "TYPE:%X\n", cf->output.type);
2291				id++;
2292				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2293				fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
2294				fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
2295				fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
2296				fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
2297				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
2298				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst));
2299				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
2300				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
2301				break;
2302			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
2303			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
2304			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
2305			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
2306			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
2307			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
2308			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
2309			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
2310			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
2311			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
2312			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
2313			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
2314			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
2315			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
2316			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
2317			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
2318				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
2319					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2320					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
2321					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2322					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
2323				fprintf(stderr, "GPR:%X ", cf->output.gpr);
2324				fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
2325				fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
2326				fprintf(stderr, "TYPE:%X\n", cf->output.type);
2327				id++;
2328				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
2329					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2330					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
2331					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2332					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
2333				fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
2334				fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
2335				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
2336				fprintf(stderr, "INST:%d ", cf->output.inst);
2337				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
2338				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
2339				break;
2340			case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
2341			case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
2342			case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
2343			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
2344			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
2345			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
2346			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
2347			case EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
2348			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
2349			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
2350				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2351				fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
2352				id++;
2353				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2354				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_WORD1_CF_INST(cf->inst));
2355				fprintf(stderr, "COND:%X ", cf->cond);
2356				fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
2357				break;
2358			case CF_NATIVE:
2359				fprintf(stderr, "%04d %08X CF NATIVE\n", id, bc->bytecode[id]);
2360				fprintf(stderr, "%04d %08X CF NATIVE\n", id + 1, bc->bytecode[id + 1]);
2361				break;
2362			default:
2363				R600_ERR("Unknown instruction %0x\n", cf->inst);
2364			}
2365		} else {
2366			switch (cf->inst) {
2367			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
2368			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
2369			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
2370			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
2371				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2372				fprintf(stderr, "ADDR:%d ", cf->addr);
2373				fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
2374				fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
2375				fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
2376				id++;
2377				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2378				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_ALU_WORD1_CF_INST(cf->inst));
2379				fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
2380				fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
2381				fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
2382				fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
2383				break;
2384			case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
2385			case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
2386			case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
2387				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2388				fprintf(stderr, "ADDR:%d\n", cf->addr);
2389				id++;
2390				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2391				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_WORD1_CF_INST(cf->inst));
2392				fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
2393				break;
2394			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
2395			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
2396				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2397				fprintf(stderr, "GPR:%X ", cf->output.gpr);
2398				fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
2399				fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
2400				fprintf(stderr, "TYPE:%X\n", cf->output.type);
2401				id++;
2402				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2403				fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
2404				fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
2405				fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
2406				fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
2407				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
2408				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst));
2409				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
2410				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
2411				break;
2412			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
2413			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
2414			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
2415			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
2416				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
2417					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2418					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
2419				fprintf(stderr, "GPR:%X ", cf->output.gpr);
2420				fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
2421				fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
2422				fprintf(stderr, "TYPE:%X\n", cf->output.type);
2423				id++;
2424				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
2425					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2426					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
2427				fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
2428				fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
2429				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
2430				fprintf(stderr, "INST:%d ", cf->output.inst);
2431				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
2432				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
2433				break;
2434			case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
2435			case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
2436			case V_SQ_CF_WORD1_SQ_CF_INST_POP:
2437			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
2438			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
2439			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
2440			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
2441			case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
2442			case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
2443				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2444				fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
2445				id++;
2446				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2447				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_WORD1_CF_INST(cf->inst));
2448				fprintf(stderr, "COND:%X ", cf->cond);
2449				fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
2450				break;
2451			default:
2452				R600_ERR("Unknown instruction %0x\n", cf->inst);
2453			}
2454		}
2455
2456		id = cf->addr;
2457		nliteral = 0;
2458		LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2459			r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
2460
2461			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2462			fprintf(stderr, "SRC0(SEL:%d ", alu->src[0].sel);
2463			fprintf(stderr, "REL:%d ", alu->src[0].rel);
2464			fprintf(stderr, "CHAN:%d ", alu->src[0].chan);
2465			fprintf(stderr, "NEG:%d) ", alu->src[0].neg);
2466			fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
2467			fprintf(stderr, "REL:%d ", alu->src[1].rel);
2468			fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
2469			fprintf(stderr, "NEG:%d ", alu->src[1].neg);
2470			fprintf(stderr, "IM:%d) ", alu->index_mode);
2471			fprintf(stderr, "PRED_SEL:%d ", alu->pred_sel);
2472			fprintf(stderr, "LAST:%d)\n", alu->last);
2473			id++;
2474			fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
2475			fprintf(stderr, "INST:0x%x ", alu->inst);
2476			fprintf(stderr, "DST(SEL:%d ", alu->dst.sel);
2477			fprintf(stderr, "CHAN:%d ", alu->dst.chan);
2478			fprintf(stderr, "REL:%d ", alu->dst.rel);
2479			fprintf(stderr, "CLAMP:%d) ", alu->dst.clamp);
2480			fprintf(stderr, "BANK_SWIZZLE:%d ", alu->bank_swizzle);
2481			if (alu->is_op3) {
2482				fprintf(stderr, "SRC2(SEL:%d ", alu->src[2].sel);
2483				fprintf(stderr, "REL:%d ", alu->src[2].rel);
2484				fprintf(stderr, "CHAN:%d ", alu->src[2].chan);
2485				fprintf(stderr, "NEG:%d)\n", alu->src[2].neg);
2486			} else {
2487				fprintf(stderr, "SRC0_ABS:%d ", alu->src[0].abs);
2488				fprintf(stderr, "SRC1_ABS:%d ", alu->src[1].abs);
2489				fprintf(stderr, "WRITE_MASK:%d ", alu->dst.write);
2490				fprintf(stderr, "OMOD:%d ", alu->omod);
2491				fprintf(stderr, "EXECUTE_MASK:%d ", alu->execute_mask);
2492				fprintf(stderr, "UPDATE_PRED:%d\n", alu->update_pred);
2493			}
2494
2495			id++;
2496			if (alu->last) {
2497				for (i = 0; i < nliteral; i++, id++) {
2498					float *f = (float*)(bc->bytecode + id);
2499					fprintf(stderr, "%04d %08X\t%f (%d)\n", id, bc->bytecode[id], *f,
2500							*(bc->bytecode + id));
2501				}
2502				id += nliteral & 1;
2503				nliteral = 0;
2504			}
2505		}
2506
2507		LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2508			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2509			fprintf(stderr, "INST:0x%x ", tex->inst);
2510			fprintf(stderr, "RESOURCE_ID:%d ", tex->resource_id);
2511			fprintf(stderr, "SRC(GPR:%d ", tex->src_gpr);
2512			fprintf(stderr, "REL:%d)\n", tex->src_rel);
2513			id++;
2514			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2515			fprintf(stderr, "DST(GPR:%d ", tex->dst_gpr);
2516			fprintf(stderr, "REL:%d ", tex->dst_rel);
2517			fprintf(stderr, "SEL_X:%d ", tex->dst_sel_x);
2518			fprintf(stderr, "SEL_Y:%d ", tex->dst_sel_y);
2519			fprintf(stderr, "SEL_Z:%d ", tex->dst_sel_z);
2520			fprintf(stderr, "SEL_W:%d) ", tex->dst_sel_w);
2521			fprintf(stderr, "LOD_BIAS:%d ", tex->lod_bias);
2522			fprintf(stderr, "COORD_TYPE_X:%d ", tex->coord_type_x);
2523			fprintf(stderr, "COORD_TYPE_Y:%d ", tex->coord_type_y);
2524			fprintf(stderr, "COORD_TYPE_Z:%d ", tex->coord_type_z);
2525			fprintf(stderr, "COORD_TYPE_W:%d\n", tex->coord_type_w);
2526			id++;
2527			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2528			fprintf(stderr, "OFFSET_X:%d ", tex->offset_x);
2529			fprintf(stderr, "OFFSET_Y:%d ", tex->offset_y);
2530			fprintf(stderr, "OFFSET_Z:%d ", tex->offset_z);
2531			fprintf(stderr, "SAMPLER_ID:%d ", tex->sampler_id);
2532			fprintf(stderr, "SRC(SEL_X:%d ", tex->src_sel_x);
2533			fprintf(stderr, "SEL_Y:%d ", tex->src_sel_y);
2534			fprintf(stderr, "SEL_Z:%d ", tex->src_sel_z);
2535			fprintf(stderr, "SEL_W:%d)\n", tex->src_sel_w);
2536			id++;
2537			fprintf(stderr, "%04d %08X   \n", id, bc->bytecode[id]);
2538			id++;
2539		}
2540
2541		LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2542			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2543			fprintf(stderr, "INST:%d ", vtx->inst);
2544			fprintf(stderr, "FETCH_TYPE:%d ", vtx->fetch_type);
2545			fprintf(stderr, "BUFFER_ID:%d\n", vtx->buffer_id);
2546			id++;
2547			/* This assumes that no semantic fetches exist */
2548			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2549			fprintf(stderr, "SRC(GPR:%d ", vtx->src_gpr);
2550			fprintf(stderr, "SEL_X:%d) ", vtx->src_sel_x);
2551			if (bc->chip_class < CAYMAN)
2552				fprintf(stderr, "MEGA_FETCH_COUNT:%d ", vtx->mega_fetch_count);
2553			else
2554				fprintf(stderr, "SEL_Y:%d) ", 0);
2555			fprintf(stderr, "DST(GPR:%d ", vtx->dst_gpr);
2556			fprintf(stderr, "SEL_X:%d ", vtx->dst_sel_x);
2557			fprintf(stderr, "SEL_Y:%d ", vtx->dst_sel_y);
2558			fprintf(stderr, "SEL_Z:%d ", vtx->dst_sel_z);
2559			fprintf(stderr, "SEL_W:%d) ", vtx->dst_sel_w);
2560			fprintf(stderr, "USE_CONST_FIELDS:%d ", vtx->use_const_fields);
2561			fprintf(stderr, "FORMAT(DATA:%d ", vtx->data_format);
2562			fprintf(stderr, "NUM:%d ", vtx->num_format_all);
2563			fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
2564			fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);
2565			id++;
2566			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2567			fprintf(stderr, "ENDIAN:%d ", vtx->endian);
2568			fprintf(stderr, "OFFSET:%d\n", vtx->offset);
2569			/* XXX */
2570			id++;
2571			fprintf(stderr, "%04d %08X   \n", id, bc->bytecode[id]);
2572			id++;
2573		}
2574	}
2575
2576	fprintf(stderr, "--------------------------------------\n");
2577}
2578
2579static void r600_vertex_data_type(enum pipe_format pformat,
2580				  unsigned *format,
2581				  unsigned *num_format, unsigned *format_comp, unsigned *endian)
2582{
2583	const struct util_format_description *desc;
2584	unsigned i;
2585
2586	*format = 0;
2587	*num_format = 0;
2588	*format_comp = 0;
2589	*endian = ENDIAN_NONE;
2590
2591	desc = util_format_description(pformat);
2592	if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
2593		goto out_unknown;
2594	}
2595
2596	/* Find the first non-VOID channel. */
2597	for (i = 0; i < 4; i++) {
2598		if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
2599			break;
2600		}
2601	}
2602
2603	*endian = r600_endian_swap(desc->channel[i].size);
2604
2605	switch (desc->channel[i].type) {
2606	/* Half-floats, floats, ints */
2607	case UTIL_FORMAT_TYPE_FLOAT:
2608		switch (desc->channel[i].size) {
2609		case 16:
2610			switch (desc->nr_channels) {
2611			case 1:
2612				*format = FMT_16_FLOAT;
2613				break;
2614			case 2:
2615				*format = FMT_16_16_FLOAT;
2616				break;
2617			case 3:
2618			case 4:
2619				*format = FMT_16_16_16_16_FLOAT;
2620				break;
2621			}
2622			break;
2623		case 32:
2624			switch (desc->nr_channels) {
2625			case 1:
2626				*format = FMT_32_FLOAT;
2627				break;
2628			case 2:
2629				*format = FMT_32_32_FLOAT;
2630				break;
2631			case 3:
2632				*format = FMT_32_32_32_FLOAT;
2633				break;
2634			case 4:
2635				*format = FMT_32_32_32_32_FLOAT;
2636				break;
2637			}
2638			break;
2639		default:
2640			goto out_unknown;
2641		}
2642		break;
2643		/* Unsigned ints */
2644	case UTIL_FORMAT_TYPE_UNSIGNED:
2645		/* Signed ints */
2646	case UTIL_FORMAT_TYPE_SIGNED:
2647		switch (desc->channel[i].size) {
2648		case 8:
2649			switch (desc->nr_channels) {
2650			case 1:
2651				*format = FMT_8;
2652				break;
2653			case 2:
2654				*format = FMT_8_8;
2655				break;
2656			case 3:
2657			case 4:
2658				*format = FMT_8_8_8_8;
2659				break;
2660			}
2661			break;
2662		case 10:
2663			if (desc->nr_channels != 4)
2664				goto out_unknown;
2665
2666			*format = FMT_2_10_10_10;
2667			break;
2668		case 16:
2669			switch (desc->nr_channels) {
2670			case 1:
2671				*format = FMT_16;
2672				break;
2673			case 2:
2674				*format = FMT_16_16;
2675				break;
2676			case 3:
2677			case 4:
2678				*format = FMT_16_16_16_16;
2679				break;
2680			}
2681			break;
2682		case 32:
2683			switch (desc->nr_channels) {
2684			case 1:
2685				*format = FMT_32;
2686				break;
2687			case 2:
2688				*format = FMT_32_32;
2689				break;
2690			case 3:
2691				*format = FMT_32_32_32;
2692				break;
2693			case 4:
2694				*format = FMT_32_32_32_32;
2695				break;
2696			}
2697			break;
2698		default:
2699			goto out_unknown;
2700		}
2701		break;
2702	default:
2703		goto out_unknown;
2704	}
2705
2706	if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2707		*format_comp = 1;
2708	}
2709
2710	*num_format = 0;
2711	if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
2712	    desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2713		if (!desc->channel[i].normalized) {
2714			if (desc->channel[i].pure_integer)
2715				*num_format = 1;
2716			else
2717				*num_format = 2;
2718		}
2719	}
2720	return;
2721out_unknown:
2722	R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
2723}
2724
2725int r600_vertex_elements_build_fetch_shader(struct r600_context *rctx, struct r600_vertex_element *ve)
2726{
2727	static int dump_shaders = -1;
2728
2729	struct r600_bytecode bc;
2730	struct r600_bytecode_vtx vtx;
2731	struct pipe_vertex_element *elements = ve->elements;
2732	const struct util_format_description *desc;
2733	unsigned fetch_resource_start = rctx->chip_class >= EVERGREEN ? 0 : 160;
2734	unsigned format, num_format, format_comp, endian;
2735	uint32_t *bytecode;
2736	int i, r;
2737
2738	memset(&bc, 0, sizeof(bc));
2739	r600_bytecode_init(&bc, rctx->chip_class, rctx->family);
2740
2741	for (i = 0; i < ve->count; i++) {
2742		if (elements[i].instance_divisor > 1) {
2743			struct r600_bytecode_alu alu;
2744
2745			memset(&alu, 0, sizeof(alu));
2746			alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2747			alu.src[0].sel = 0;
2748			alu.src[0].chan = 3;
2749
2750			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2751			alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2752
2753			alu.dst.sel = i + 1;
2754			alu.dst.chan = 3;
2755			alu.dst.write = 1;
2756			alu.last = 1;
2757
2758			if ((r = r600_bytecode_add_alu(&bc, &alu))) {
2759				r600_bytecode_clear(&bc);
2760				return r;
2761			}
2762		}
2763	}
2764
2765	for (i = 0; i < ve->count; i++) {
2766		r600_vertex_data_type(ve->elements[i].src_format,
2767				      &format, &num_format, &format_comp, &endian);
2768
2769		desc = util_format_description(ve->elements[i].src_format);
2770		if (desc == NULL) {
2771			r600_bytecode_clear(&bc);
2772			R600_ERR("unknown format %d\n", ve->elements[i].src_format);
2773			return -EINVAL;
2774		}
2775
2776		if (elements[i].src_offset > 65535) {
2777			r600_bytecode_clear(&bc);
2778			R600_ERR("too big src_offset: %u\n", elements[i].src_offset);
2779			return -EINVAL;
2780		}
2781
2782		memset(&vtx, 0, sizeof(vtx));
2783		vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start;
2784		vtx.fetch_type = elements[i].instance_divisor ? 1 : 0;
2785		vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
2786		vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
2787		vtx.mega_fetch_count = 0x1F;
2788		vtx.dst_gpr = i + 1;
2789		vtx.dst_sel_x = desc->swizzle[0];
2790		vtx.dst_sel_y = desc->swizzle[1];
2791		vtx.dst_sel_z = desc->swizzle[2];
2792		vtx.dst_sel_w = desc->swizzle[3];
2793		vtx.data_format = format;
2794		vtx.num_format_all = num_format;
2795		vtx.format_comp_all = format_comp;
2796		vtx.srf_mode_all = 1;
2797		vtx.offset = elements[i].src_offset;
2798		vtx.endian = endian;
2799
2800		if ((r = r600_bytecode_add_vtx(&bc, &vtx))) {
2801			r600_bytecode_clear(&bc);
2802			return r;
2803		}
2804	}
2805
2806	r600_bytecode_add_cfinst(&bc, BC_INST(&bc, V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
2807
2808	if ((r = r600_bytecode_build(&bc))) {
2809		r600_bytecode_clear(&bc);
2810		return r;
2811	}
2812
2813	if (dump_shaders == -1)
2814		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
2815
2816	if (dump_shaders) {
2817		fprintf(stderr, "--------------------------------------------------------------\n");
2818		r600_bytecode_dump(&bc);
2819		fprintf(stderr, "______________________________________________________________\n");
2820	}
2821
2822	ve->fs_size = bc.ndw*4;
2823
2824	ve->fetch_shader = (struct r600_resource*)
2825			pipe_buffer_create(rctx->context.screen,
2826					   PIPE_BIND_CUSTOM,
2827					   PIPE_USAGE_IMMUTABLE, ve->fs_size);
2828	if (ve->fetch_shader == NULL) {
2829		r600_bytecode_clear(&bc);
2830		return -ENOMEM;
2831	}
2832
2833	bytecode = rctx->ws->buffer_map(ve->fetch_shader->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
2834	if (bytecode == NULL) {
2835		r600_bytecode_clear(&bc);
2836		pipe_resource_reference((struct pipe_resource**)&ve->fetch_shader, NULL);
2837		return -ENOMEM;
2838	}
2839
2840	if (R600_BIG_ENDIAN) {
2841		for (i = 0; i < ve->fs_size / 4; ++i) {
2842			bytecode[i] = bswap_32(bc.bytecode[i]);
2843		}
2844	} else {
2845		memcpy(bytecode, bc.bytecode, ve->fs_size);
2846	}
2847
2848	rctx->ws->buffer_unmap(ve->fetch_shader->cs_buf);
2849	r600_bytecode_clear(&bc);
2850
2851	if (rctx->chip_class >= EVERGREEN)
2852		evergreen_fetch_shader(&rctx->context, ve);
2853	else
2854		r600_fetch_shader(&rctx->context, ve);
2855
2856	return 0;
2857}
2858