r600_asm.c revision 56227f875bdff6ef4fd53b09ba267c786ae9dac2
13aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev/*
23aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
33aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev *
43aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Permission is hereby granted, free of charge, to any person obtaining a
53aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * copy of this software and associated documentation files (the "Software"),
63aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * to deal in the Software without restriction, including without limitation
73aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * on the rights to use, copy, modify, merge, publish, distribute, sub
83aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * license, and/or sell copies of the Software, and to permit persons to whom
93aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * the Software is furnished to do so, subject to the following conditions:
103aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev *
113aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * The above copyright notice and this permission notice (including the next
123aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * paragraph) shall be included in all copies or substantial portions of the
133aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * Software.
143aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev *
153aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
163aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
173aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
183aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
193aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
203aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
213aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev * USE OR OTHER DEALINGS IN THE SOFTWARE.
223aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev */
233aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include "r600_sq.h"
243aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include "r600_opcodes.h"
253aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include "r600_formats.h"
263aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include "r600d.h"
273aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev
283aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include <errno.h>
293aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include <byteswap.h>
303aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include "util/u_memory.h"
313aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#include "pipe/p_shader_tokens.h"
323aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev
333aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#define NUM_OF_CYCLES 3
343aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev#define NUM_OF_COMPONENTS 4
353aa430dc5437a98734b36f996f9b17081a589143Yavor Goulishev
/*
 * Return the number of source operands read by an ALU instruction.
 *
 * Instructions flagged is_op3 always report three sources. For the others
 * the count (0, 1 or 2) is looked up by opcode, using the plain opcode names
 * for R600/R700 and the EG_-prefixed names for EVERGREEN/CAYMAN.
 *
 * An opcode that is not listed is reported through R600_ERR and the function
 * then falls through to the final "return 3".
 */
static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	if(alu->is_op3)
		return 3;

	switch (bc->chip_class) {
	case R600:
	case R700:
		switch (alu->inst) {
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
			return 0;
		/* Two-source opcodes. */
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_IEEE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT:
			return 2;

		/* One-source opcodes. */
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE:
		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT:
			return 1;
		/* Unlisted opcode: report it, then fall out to the default below. */
		default: R600_ERR(
			"Need instruction operand number for 0x%x.\n", alu->inst);
		}
		break;
	case EVERGREEN:
	case CAYMAN:
		switch (alu->inst) {
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
			return 0;
		/* Two-source opcodes. */
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_IEEE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT:
			return 2;

		/* One-source opcodes. */
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT:
		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT:
			return 1;
		/* Unlisted opcode: report it, then fall out to the default below. */
		default: R600_ERR(
			"Need instruction operand number for 0x%x.\n", alu->inst);
		}
		break;
	}

	/* Reached for unlisted opcodes (after the error above) or an unhandled chip class. */
	return 3;
}
211
/* Forward declaration; the definition is not part of this section of the file. */
int r700_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id);
213
214static struct r600_bytecode_cf *r600_bytecode_cf(void)
215{
216	struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf);
217
218	if (cf == NULL)
219		return NULL;
220	LIST_INITHEAD(&cf->list);
221	LIST_INITHEAD(&cf->alu);
222	LIST_INITHEAD(&cf->vtx);
223	LIST_INITHEAD(&cf->tex);
224	return cf;
225}
226
227static struct r600_bytecode_alu *r600_bytecode_alu(void)
228{
229	struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu);
230
231	if (alu == NULL)
232		return NULL;
233	LIST_INITHEAD(&alu->list);
234	return alu;
235}
236
237static struct r600_bytecode_vtx *r600_bytecode_vtx(void)
238{
239	struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx);
240
241	if (vtx == NULL)
242		return NULL;
243	LIST_INITHEAD(&vtx->list);
244	return vtx;
245}
246
247static struct r600_bytecode_tex *r600_bytecode_tex(void)
248{
249	struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex);
250
251	if (tex == NULL)
252		return NULL;
253	LIST_INITHEAD(&tex->list);
254	return tex;
255}
256
257void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family)
258{
259	if ((chip_class == R600) && (family != CHIP_RV670))
260		bc->ar_handling = AR_HANDLE_RV6XX;
261	else
262		bc->ar_handling = AR_HANDLE_NORMAL;
263
264	if ((chip_class == R600) && (family != CHIP_RV670 && family != CHIP_RS780 &&
265					   family != CHIP_RS880))
266		bc->r6xx_nop_after_rel_dst = 1;
267	else
268		bc->r6xx_nop_after_rel_dst = 0;
269	LIST_INITHEAD(&bc->cf);
270	bc->chip_class = chip_class;
271}
272
273static int r600_bytecode_add_cf(struct r600_bytecode *bc)
274{
275	struct r600_bytecode_cf *cf = r600_bytecode_cf();
276
277	if (cf == NULL)
278		return -ENOMEM;
279	LIST_ADDTAIL(&cf->list, &bc->cf);
280	if (bc->cf_last) {
281		cf->id = bc->cf_last->id + 2;
282		if (bc->cf_last->eg_alu_extended) {
283			/* take into account extended alu size */
284			cf->id += 2;
285			bc->ndw += 2;
286		}
287	}
288	bc->cf_last = cf;
289	bc->ncf++;
290	bc->ndw += 2;
291	bc->force_add_cf = 0;
292	bc->ar_loaded = 0;
293	return 0;
294}
295
296int r600_bytecode_add_output(struct r600_bytecode *bc, const struct r600_bytecode_output *output)
297{
298	int r;
299
300	if (output->gpr >= bc->ngpr)
301		bc->ngpr = output->gpr + 1;
302
303	if (bc->cf_last && (bc->cf_last->inst == output->inst ||
304		(bc->cf_last->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT) &&
305		output->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE))) &&
306		output->type == bc->cf_last->output.type &&
307		output->elem_size == bc->cf_last->output.elem_size &&
308		output->swizzle_x == bc->cf_last->output.swizzle_x &&
309		output->swizzle_y == bc->cf_last->output.swizzle_y &&
310		output->swizzle_z == bc->cf_last->output.swizzle_z &&
311		output->swizzle_w == bc->cf_last->output.swizzle_w &&
312		(output->burst_count + bc->cf_last->output.burst_count) <= 16) {
313
314		if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
315			(output->array_base + output->burst_count) == bc->cf_last->output.array_base) {
316
317			bc->cf_last->output.end_of_program |= output->end_of_program;
318			bc->cf_last->output.inst = output->inst;
319			bc->cf_last->output.gpr = output->gpr;
320			bc->cf_last->output.array_base = output->array_base;
321			bc->cf_last->output.burst_count += output->burst_count;
322			return 0;
323
324		} else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
325			output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {
326
327			bc->cf_last->output.end_of_program |= output->end_of_program;
328			bc->cf_last->output.inst = output->inst;
329			bc->cf_last->output.burst_count += output->burst_count;
330			return 0;
331		}
332	}
333
334	r = r600_bytecode_add_cf(bc);
335	if (r)
336		return r;
337	bc->cf_last->inst = output->inst;
338	memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output));
339	return 0;
340}
341
342/* alu instructions that can ony exits once per group */
343static int is_alu_once_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
344{
345	switch (bc->chip_class) {
346	case R600:
347	case R700:
348		return !alu->is_op3 && (
349			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
350			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
351			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
352			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
353			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
354			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
355			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
356			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
357			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
358			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
359			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
360			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
361			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
362			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
363			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
364			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
365			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
366			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
367			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
368			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
369			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
370			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
371			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
372			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
373			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
374			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
375			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
376			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
377			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
378			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
379			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
380			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
381			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
382			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
383	case EVERGREEN:
384	case CAYMAN:
385	default:
386		return !alu->is_op3 && (
387			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
388			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
389			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
390			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
391			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
392			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
393			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
394			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
395			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
396			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
397			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
398			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
399			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
400			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
401			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
402			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
403			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
404			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
405			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
406			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
407			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
408			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
409			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
410			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
411			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
412			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
413			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
414			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
415			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
416			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
417			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
418			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
419			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
420			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
421	}
422}
423
424static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
425{
426	switch (bc->chip_class) {
427	case R600:
428	case R700:
429		return !alu->is_op3 && (
430			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
431			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
432			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
433			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
434	case EVERGREEN:
435	case CAYMAN:
436	default:
437		return !alu->is_op3 && (
438			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
439			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
440			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
441			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
442	}
443}
444
445static int is_alu_cube_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
446{
447	switch (bc->chip_class) {
448	case R600:
449	case R700:
450		return !alu->is_op3 &&
451			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
452	case EVERGREEN:
453	case CAYMAN:
454	default:
455		return !alu->is_op3 &&
456			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
457	}
458}
459
460static int is_alu_mova_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
461{
462	switch (bc->chip_class) {
463	case R600:
464	case R700:
465		return !alu->is_op3 && (
466			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
467			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
468			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT ||
469			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT);
470	case EVERGREEN:
471	case CAYMAN:
472	default:
473		return !alu->is_op3 && (
474			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
475	}
476}
477
/* Return 1 when opcode lies in the inclusive range [min, max], else 0. */
static int is_opcode_in_range(unsigned opcode, unsigned min, unsigned max)
{
	if (opcode < min)
		return 0;
	return opcode <= max;
}
482
483/* ALU instructions that can only execute on the vector unit:
484 *
485 * opcode ranges:
486 * R6xx/R7xx:
487 *   op3 : [0x08 - 0x0B]
488 *   op2 : 0x07, [0x15 - 0x18], [0x1B - 0x1D], [0x50 - 0x53], [0x7A - 0x7E]
489 *
490 * EVERGREEN:
491 *   op3: [0x04 - 0x11]
492 *   op2: [0xA0 - 0xE2]
493 */
494static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
495{
496	switch (bc->chip_class) {
497	case R600:
498	case R700:
499		if (alu->is_op3)
500			return is_opcode_in_range(alu->inst,
501					V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_64,
502					V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_64_D2);
503		else
504			return (alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FREXP_64) ||
505					is_opcode_in_range(alu->inst,
506						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA,
507						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT) ||
508					is_opcode_in_range(alu->inst,
509						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_64,
510						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT32_TO_FLT64) ||
511					is_opcode_in_range(alu->inst,
512						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4,
513						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4) ||
514					is_opcode_in_range(alu->inst,
515						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LDEXP_64,
516						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_64);
517
518	case EVERGREEN:
519		if (alu->is_op3)
520			return is_opcode_in_range(alu->inst,
521					EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_BFE_UINT,
522					EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_LDS_IDX_OP);
523		else
524			return is_opcode_in_range(alu->inst,
525					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_BFM_INT,
526					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P20);
527	case CAYMAN:
528	default:
529		assert(0);
530		return 0;
531	}
532}
533
534/* ALU instructions that can only execute on the trans unit:
535 *
536 * opcode ranges:
537 * R600:
538 *   op3: 0x0C
539 *   op2: [0x60 - 0x79]
540 *
541 * R700:
542 *   op3: 0x0C
543 *   op2: [0x60 - 0x6F], [0x73 - 0x79]
544 *
545 * EVERGREEN:
546 *   op3: 0x1F
547 *   op2: [0x81 - 0x9C]
548 */
549static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
550{
551
552	switch (bc->chip_class) {
553	case R600:
554		if (alu->is_op3)
555			return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
556		else
557			return is_opcode_in_range(alu->inst,
558					V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT,
559					V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
560	case R700:
561		if (alu->is_op3)
562			return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
563		else
564			return is_opcode_in_range(alu->inst,
565						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT,
566						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS) ||
567					is_opcode_in_range(alu->inst,
568							V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT,
569							V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
570	case EVERGREEN:
571		if (alu->is_op3)
572			return alu->inst == EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
573		else
574			return is_opcode_in_range(alu->inst,
575					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE,
576					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
577	case CAYMAN:
578	default:
579		assert(0);
580		return 0;
581	}
582}
583
/*
 * ALU instructions that can execute on any unit, i.e. those that are neither
 * vector-only nor trans-only. Returns 1 for such instructions, 0 otherwise.
 */
static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	if (is_alu_vec_unit_inst(bc, alu))
		return 0;
	return !is_alu_trans_unit_inst(bc, alu);
}
590
591static int is_nop_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
592{
593	switch (bc->chip_class) {
594	case R600:
595	case R700:
596		return (!alu->is_op3 && alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
597	case EVERGREEN:
598	case CAYMAN:
599	default:
600		return (!alu->is_op3 && alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
601	}
602}
603
604static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first,
605			    struct r600_bytecode_alu *assignment[5])
606{
607	struct r600_bytecode_alu *alu;
608	unsigned i, chan, trans;
609	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
610
611	for (i = 0; i < max_slots; i++)
612		assignment[i] = NULL;
613
614	for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bytecode_alu, alu->list.next, list)) {
615		chan = alu->dst.chan;
616		if (max_slots == 4)
617			trans = 0;
618		else if (is_alu_trans_unit_inst(bc, alu))
619			trans = 1;
620		else if (is_alu_vec_unit_inst(bc, alu))
621			trans = 0;
622		else if (assignment[chan])
623			trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */
624		else
625			trans = 0;
626
627		if (trans) {
628			if (assignment[4]) {
629				assert(0); /* ALU.Trans has already been allocated. */
630				return -1;
631			}
632			assignment[4] = alu;
633		} else {
634			if (assignment[chan]) {
635				assert(0); /* ALU.chan has already been allocated. */
636				return -1;
637			}
638			assignment[chan] = alu;
639		}
640
641		if (alu->last)
642			break;
643	}
644	return 0;
645}
646
/*
 * Read-port reservations made while checking one instruction group.
 * Every entry holds -1 while it is unreserved.
 */
struct alu_bank_swizzle {
	int	hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS]; /* GPR sel reserved for [cycle][chan] */
	int	hw_cfile_addr[4]; /* sel of each reserved constant-file read */
	int	hw_cfile_elem[4]; /* element stored alongside the matching hw_cfile_addr entry */
};
652
/*
 * Read cycle used by each source operand for a given vector bank swizzle,
 * indexed as [bank_swizzle][src].
 */
static const unsigned cycle_for_bank_swizzle_vec[][3] = {
	[SQ_ALU_VEC_012] = { 0, 1, 2 },
	[SQ_ALU_VEC_021] = { 0, 2, 1 },
	[SQ_ALU_VEC_120] = { 1, 2, 0 },
	[SQ_ALU_VEC_102] = { 1, 0, 2 },
	[SQ_ALU_VEC_201] = { 2, 0, 1 },
	[SQ_ALU_VEC_210] = { 2, 1, 0 }
};
661
/*
 * Per-source cycle table keyed by the SQ_ALU_SCL_* bank swizzles.
 * NOTE(review): presumably the scalar counterpart of
 * cycle_for_bank_swizzle_vec; its users are outside this section - confirm.
 */
static const unsigned cycle_for_bank_swizzle_scl[][3] = {
	[SQ_ALU_SCL_210] = { 2, 1, 0 },
	[SQ_ALU_SCL_122] = { 1, 2, 2 },
	[SQ_ALU_SCL_212] = { 2, 1, 2 },
	[SQ_ALU_SCL_221] = { 2, 2, 1 }
};
668
669static void init_bank_swizzle(struct alu_bank_swizzle *bs)
670{
671	int i, cycle, component;
672	/* set up gpr use */
673	for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
674		for (component = 0; component < NUM_OF_COMPONENTS; component++)
675			 bs->hw_gpr[cycle][component] = -1;
676	for (i = 0; i < 4; i++)
677		bs->hw_cfile_addr[i] = -1;
678	for (i = 0; i < 4; i++)
679		bs->hw_cfile_elem[i] = -1;
680}
681
682static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
683{
684	if (bs->hw_gpr[cycle][chan] == -1)
685		bs->hw_gpr[cycle][chan] = sel;
686	else if (bs->hw_gpr[cycle][chan] != (int)sel) {
687		/* Another scalar operation has already used the GPR read port for the channel. */
688		return -1;
689	}
690	return 0;
691}
692
693static int reserve_cfile(struct r600_bytecode *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
694{
695	int res, num_res = 4;
696	if (bc->chip_class >= R700) {
697		num_res = 2;
698		chan /= 2;
699	}
700	for (res = 0; res < num_res; ++res) {
701		if (bs->hw_cfile_addr[res] == -1) {
702			bs->hw_cfile_addr[res] = sel;
703			bs->hw_cfile_elem[res] = chan;
704			return 0;
705		} else if (bs->hw_cfile_addr[res] == sel &&
706			bs->hw_cfile_elem[res] == chan)
707			return 0; /* Read for this scalar element already reserved, nothing to do here. */
708	}
709	/* All cfile read ports are used, cannot reference vector element. */
710	return -1;
711}
712
/* Returns 1 when sel lies in the range 0..127, 0 otherwise. */
static int is_gpr(unsigned sel)
{
	return sel < 128;
}
717
/* CB constants start at 512 and are translated to a kcache index when ALU
 * clauses are constructed. Kcache constants are handled the same way as the
 * (now removed) cfile constants; whether that is really required is an open
 * question.
 *
 * Returns 1 when sel falls in one of the constant ranges, 0 otherwise.
 * NOTE(review): the upper range stops at 4606, so 4607 is not treated as a
 * constant - confirm whether that is intended. */
static int is_cfile(unsigned sel)
{
	if (sel >= 128 && sel <= 191)	/* Kcache after translation. */
		return 1;
	if (sel >= 256 && sel <= 511)
		return 1;
	if (sel >= 512 && sel <= 4606)	/* Kcache before translation. */
		return 1;
	return 0;
}
727
728static int is_const(int sel)
729{
730	return is_cfile(sel) ||
731		(sel >= V_SQ_ALU_SRC_0 &&
732		sel <= V_SQ_ALU_SRC_LITERAL);
733}
734
735static int check_vector(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
736			struct alu_bank_swizzle *bs, int bank_swizzle)
737{
738	int r, src, num_src, sel, elem, cycle;
739
740	num_src = r600_bytecode_get_num_operands(bc, alu);
741	for (src = 0; src < num_src; src++) {
742		sel = alu->src[src].sel;
743		elem = alu->src[src].chan;
744		if (is_gpr(sel)) {
745			cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
746			if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
747				/* Nothing to do; special-case optimization,
748				 * second source uses first source’s reservation. */
749				continue;
750			else {
751				r = reserve_gpr(bs, sel, elem, cycle);
752				if (r)
753					return r;
754			}
755		} else if (is_cfile(sel)) {
756			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
757			if (r)
758				return r;
759		}
760		/* No restrictions on PV, PS, literal or special constants. */
761	}
762	return 0;
763}
764
765static int check_scalar(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
766			struct alu_bank_swizzle *bs, int bank_swizzle)
767{
768	int r, src, num_src, const_count, sel, elem, cycle;
769
770	num_src = r600_bytecode_get_num_operands(bc, alu);
771	for (const_count = 0, src = 0; src < num_src; ++src) {
772		sel = alu->src[src].sel;
773		elem = alu->src[src].chan;
774		if (is_const(sel)) { /* Any constant, including literal and inline constants. */
775			if (const_count >= 2)
776				/* More than two references to a constant in
777				 * transcendental operation. */
778				return -1;
779			else
780				const_count++;
781		}
782		if (is_cfile(sel)) {
783			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
784			if (r)
785				return r;
786		}
787	}
788	for (src = 0; src < num_src; ++src) {
789		sel = alu->src[src].sel;
790		elem = alu->src[src].chan;
791		if (is_gpr(sel)) {
792			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
793			if (cycle < const_count)
794				/* Cycle for GPR load conflicts with
795				 * constant load in transcendental operation. */
796				return -1;
797			r = reserve_gpr(bs, sel, elem, cycle);
798			if (r)
799				return r;
800		}
801		/* PV PS restrictions */
802		if (const_count && (sel == 254 || sel == 255)) {
803			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
804			if (cycle < const_count)
805				return -1;
806		}
807	}
808	return 0;
809}
810
811static int check_and_set_bank_swizzle(struct r600_bytecode *bc,
812				      struct r600_bytecode_alu *slots[5])
813{
814	struct alu_bank_swizzle bs;
815	int bank_swizzle[5];
816	int i, r = 0, forced = 1;
817	boolean scalar_only = bc->chip_class == CAYMAN ? false : true;
818	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
819
820	for (i = 0; i < max_slots; i++) {
821		if (slots[i]) {
822			if (slots[i]->bank_swizzle_force) {
823				slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
824			} else {
825				forced = 0;
826			}
827		}
828
829		if (i < 4 && slots[i])
830			scalar_only = false;
831	}
832	if (forced)
833		return 0;
834
835	/* Just check every possible combination of bank swizzle.
836	 * Not very efficent, but works on the first try in most of the cases. */
837	for (i = 0; i < 4; i++)
838		if (!slots[i] || !slots[i]->bank_swizzle_force)
839			bank_swizzle[i] = SQ_ALU_VEC_012;
840		else
841			bank_swizzle[i] = slots[i]->bank_swizzle;
842
843	bank_swizzle[4] = SQ_ALU_SCL_210;
844	while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
845
846		if (max_slots == 4) {
847			for (i = 0; i < max_slots; i++) {
848				if (bank_swizzle[i] == SQ_ALU_VEC_210)
849				  return -1;
850			}
851		}
852		init_bank_swizzle(&bs);
853		if (scalar_only == false) {
854			for (i = 0; i < 4; i++) {
855				if (slots[i]) {
856					r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
857					if (r)
858						break;
859				}
860			}
861		} else
862			r = 0;
863
864		if (!r && slots[4] && max_slots == 5) {
865			r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
866		}
867		if (!r) {
868			for (i = 0; i < max_slots; i++) {
869				if (slots[i])
870					slots[i]->bank_swizzle = bank_swizzle[i];
871			}
872			return 0;
873		}
874
875		if (scalar_only) {
876			bank_swizzle[4]++;
877		} else {
878			for (i = 0; i < max_slots; i++) {
879				if (!slots[i] || !slots[i]->bank_swizzle_force) {
880					bank_swizzle[i]++;
881					if (bank_swizzle[i] <= SQ_ALU_VEC_210)
882						break;
883					else
884						bank_swizzle[i] = SQ_ALU_VEC_012;
885				}
886			}
887		}
888	}
889
890	/* Couldn't find a working swizzle. */
891	return -1;
892}
893
894static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
895				  struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev)
896{
897	struct r600_bytecode_alu *prev[5];
898	int gpr[5], chan[5];
899	int i, j, r, src, num_src;
900	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
901
902	r = assign_alu_units(bc, alu_prev, prev);
903	if (r)
904		return r;
905
906	for (i = 0; i < max_slots; ++i) {
907		if (prev[i] && (prev[i]->dst.write || prev[i]->is_op3) && !prev[i]->dst.rel) {
908			gpr[i] = prev[i]->dst.sel;
909			/* cube writes more than PV.X */
910			if (!is_alu_cube_inst(bc, prev[i]) && is_alu_reduction_inst(bc, prev[i]))
911				chan[i] = 0;
912			else
913				chan[i] = prev[i]->dst.chan;
914		} else
915			gpr[i] = -1;
916	}
917
918	for (i = 0; i < max_slots; ++i) {
919		struct r600_bytecode_alu *alu = slots[i];
920		if(!alu)
921			continue;
922
923		num_src = r600_bytecode_get_num_operands(bc, alu);
924		for (src = 0; src < num_src; ++src) {
925			if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
926				continue;
927
928			if (bc->chip_class < CAYMAN) {
929				if (alu->src[src].sel == gpr[4] &&
930				    alu->src[src].chan == chan[4] &&
931				    alu_prev->pred_sel == alu->pred_sel) {
932					alu->src[src].sel = V_SQ_ALU_SRC_PS;
933					alu->src[src].chan = 0;
934					continue;
935				}
936			}
937
938			for (j = 0; j < 4; ++j) {
939				if (alu->src[src].sel == gpr[j] &&
940					alu->src[src].chan == j &&
941				      alu_prev->pred_sel == alu->pred_sel) {
942					alu->src[src].sel = V_SQ_ALU_SRC_PV;
943					alu->src[src].chan = chan[j];
944					break;
945				}
946			}
947		}
948	}
949
950	return 0;
951}
952
953void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *neg)
954{
955	switch(value) {
956	case 0:
957		*sel = V_SQ_ALU_SRC_0;
958		break;
959	case 1:
960		*sel = V_SQ_ALU_SRC_1_INT;
961		break;
962	case -1:
963		*sel = V_SQ_ALU_SRC_M_1_INT;
964		break;
965	case 0x3F800000: /* 1.0f */
966		*sel = V_SQ_ALU_SRC_1;
967		break;
968	case 0x3F000000: /* 0.5f */
969		*sel = V_SQ_ALU_SRC_0_5;
970		break;
971	case 0xBF800000: /* -1.0f */
972		*sel = V_SQ_ALU_SRC_1;
973		*neg ^= 1;
974		break;
975	case 0xBF000000: /* -0.5f */
976		*sel = V_SQ_ALU_SRC_0_5;
977		*neg ^= 1;
978		break;
979	default:
980		*sel = V_SQ_ALU_SRC_LITERAL;
981		break;
982	}
983}
984
985/* compute how many literal are needed */
986static int r600_bytecode_alu_nliterals(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
987				 uint32_t literal[4], unsigned *nliteral)
988{
989	unsigned num_src = r600_bytecode_get_num_operands(bc, alu);
990	unsigned i, j;
991
992	for (i = 0; i < num_src; ++i) {
993		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
994			uint32_t value = alu->src[i].value;
995			unsigned found = 0;
996			for (j = 0; j < *nliteral; ++j) {
997				if (literal[j] == value) {
998					found = 1;
999					break;
1000				}
1001			}
1002			if (!found) {
1003				if (*nliteral >= 4)
1004					return -EINVAL;
1005				literal[(*nliteral)++] = value;
1006			}
1007		}
1008	}
1009	return 0;
1010}
1011
1012static void r600_bytecode_alu_adjust_literals(struct r600_bytecode *bc,
1013					struct r600_bytecode_alu *alu,
1014					uint32_t literal[4], unsigned nliteral)
1015{
1016	unsigned num_src = r600_bytecode_get_num_operands(bc, alu);
1017	unsigned i, j;
1018
1019	for (i = 0; i < num_src; ++i) {
1020		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1021			uint32_t value = alu->src[i].value;
1022			for (j = 0; j < nliteral; ++j) {
1023				if (literal[j] == value) {
1024					alu->src[i].chan = j;
1025					break;
1026				}
1027			}
1028		}
1029	}
1030}
1031
/* Try to merge the instruction group in slots[] into the previous group
 * (the one starting at alu_prev).
 *
 * Every reason for not merging ends in an early "return 0" before anything
 * outside this function has been modified, so a return value of 0 does not
 * by itself mean that a merge happened. A non-zero value is returned only
 * when assign_alu_units() fails for the previous group.
 *
 * On a successful merge slots[] is overwritten with the merged group, the
 * merged instructions are re-linked at the tail of the current clause, the
 * dwords counted for the previous group's literals are removed again, and
 * the clause's group-head pointers are updated. */
static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5],
			     struct r600_bytecode_alu *alu_prev)
{
	struct r600_bytecode_alu *prev[5];
	struct r600_bytecode_alu *result[5] = { NULL };

	/* literal[] collects the literals of both groups together,
	 * prev_literal[] only those of the previous group. */
	uint32_t literal[4], prev_literal[4];
	unsigned nliteral = 0, prev_nliteral = 0;

	int i, j, r, src, num_src;
	int num_once_inst = 0;
	int have_mova = 0, have_rel = 0;
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	r = assign_alu_units(bc, alu_prev, prev);
	if (r)
		return r;

	for (i = 0; i < max_slots; ++i) {
		struct r600_bytecode_alu *alu;

		/* check number of literals */
		if (prev[i]) {
			if (r600_bytecode_alu_nliterals(bc, prev[i], literal, &nliteral))
				return 0;
			if (r600_bytecode_alu_nliterals(bc, prev[i], prev_literal, &prev_nliteral))
				return 0;
			/* A MOVA in the previous group and relative addressing
			 * in the current group cannot end up in one group. */
			if (is_alu_mova_inst(bc, prev[i])) {
				if (have_rel)
					return 0;
				have_mova = 1;
			}
			num_once_inst += is_alu_once_inst(bc, prev[i]);
		}
		if (slots[i] && r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral))
			return 0;

		/* Let's check used slots. */
		if (prev[i] && !slots[i]) {
			result[i] = prev[i];
			continue;
		} else if (prev[i] && slots[i]) {
			if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
				/* Trans unit is still free try to use it. */
				if (is_alu_any_unit_inst(bc, slots[i])) {
					result[i] = prev[i];
					result[4] = slots[i];
				} else if (is_alu_any_unit_inst(bc, prev[i])) {
					/* Do not merge when both instructions
					 * write the same destination register. */
					if (slots[i]->dst.sel == prev[i]->dst.sel &&
						(slots[i]->dst.write == 1 || slots[i]->is_op3) &&
						(prev[i]->dst.write == 1 || prev[i]->is_op3))
						return 0;

					result[i] = slots[i];
					result[4] = prev[i];
				} else
					return 0;
			} else
				return 0;
		} else if(!slots[i]) {
			continue;
		} else {
			/* Only the current group uses this slot; refuse to merge
			 * if it writes the same register and channel as the
			 * previous group's trans instruction. */
			if (max_slots == 5 && slots[i] && prev[4] &&
					slots[i]->dst.sel == prev[4]->dst.sel &&
					slots[i]->dst.chan == prev[4]->dst.chan &&
					(slots[i]->dst.write == 1 || slots[i]->is_op3) &&
					(prev[4]->dst.write == 1 || prev[4]->is_op3))
				return 0;

			result[i] = slots[i];
		}

		/* From here on only the instruction of the current group is
		 * examined. */
		alu = slots[i];
		num_once_inst += is_alu_once_inst(bc, alu);

		/* don't reschedule NOPs */
		if (is_nop_inst(bc, alu))
			return 0;

		/* Let's check dst gpr. */
		if (alu->dst.rel) {
			if (have_mova)
				return 0;
			have_rel = 1;
		}

		/* Let's check source gprs */
		num_src = r600_bytecode_get_num_operands(bc, alu);
		for (src = 0; src < num_src; ++src) {
			if (alu->src[src].rel) {
				if (have_mova)
					return 0;
				have_rel = 1;
			}

			/* Constants don't matter. */
			if (!is_gpr(alu->src[src].sel))
				continue;

			/* The current instruction must not read a register
			 * that the previous group writes. */
			for (j = 0; j < max_slots; ++j) {
				if (!prev[j] || !(prev[j]->dst.write || prev[j]->is_op3))
					continue;

				/* If it's relative then we can't determine which gpr is really used. */
				if (prev[j]->dst.chan == alu->src[src].chan &&
					(prev[j]->dst.sel == alu->src[src].sel ||
					prev[j]->dst.rel || alu->src[src].rel))
					return 0;
			}
		}
	}

	/* more than one PRED_ or KILL_ ? */
	if (num_once_inst > 1)
		return 0;

	/* check if the result can still be swizzled */
	r = check_and_set_bank_swizzle(bc, result);
	if (r)
		return 0;

	/* looks like everything worked out right, apply the changes */

	/* undo adding previous literals */
	bc->cf_last->ndw -= align(prev_nliteral, 2);

	/* sort instructions */
	for (i = 0; i < max_slots; ++i) {
		slots[i] = result[i];
		if (result[i]) {
			LIST_DEL(&result[i]->list);
			result[i]->last = 0;
			LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
		}
	}

	/* determine new last instruction */
	LIST_ENTRY(struct r600_bytecode_alu, bc->cf_last->alu.prev, list)->last = 1;

	/* determine new first instruction */
	for (i = 0; i < max_slots; ++i) {
		if (result[i]) {
			bc->cf_last->curr_bs_head = result[i];
			break;
		}
	}

	/* The previous group no longer exists on its own, so the group before
	 * it becomes the new "previous" one. */
	bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
	bc->cf_last->prev2_bs_head = NULL;

	return 0;
}
1184
/* Make sure that constant-cache line `line` of constant buffer `bank` is
 * covered by one of the kcache sets in kcache[], either by extending an
 * existing set to two lines or by claiming/inserting a new set.
 *
 * we'll keep kcache sets sorted by bank & addr
 *
 * Four sets are considered on Evergreen and newer, two otherwise.
 * Returns 0 on success and -ENOMEM when the line cannot be covered.
 * Note that kcache[] may already have been modified when -ENOMEM is
 * returned. */
static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc,
		struct r600_bytecode_kcache *kcache,
		unsigned bank, unsigned line)
{
	int i, kcache_banks = bc->chip_class >= EVERGREEN ? 4 : 2;

	for (i = 0; i < kcache_banks; i++) {
		if (kcache[i].mode) {
			int d;

			/* Sets for lower banks sort before the wanted one. */
			if (kcache[i].bank < bank)
				continue;

			if ((kcache[i].bank == bank && kcache[i].addr > line+1) ||
					kcache[i].bank > bank) {
				/* try to insert new line */
				if (kcache[kcache_banks-1].mode) {
					/* all sets are in use */
					return -ENOMEM;
				}

				/* Shift this and the following sets up by one
				 * to make room at position i. */
				memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache));
				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
				kcache[i].bank = bank;
				kcache[i].addr = line;
				return 0;
			}

			/* Same bank: distance of the wanted line from the start
			 * of this set. */
			d = line - kcache[i].addr;

			if (d == -1) {
				kcache[i].addr--;
				if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) {
					/* we are prepending the line to the current set,
					 * discarding the existing second line,
					 * so we'll have to insert line+2 after it */
					line += 2;
					continue;
				} else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) {
					kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
					return 0;
				} else {
					/* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
					return -ENOMEM;
				}
			} else if (d == 1) {
				/* Line directly after the set's first line:
				 * lock two lines. */
				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
				return 0;
			} else if (d == 0)
				return 0;
		} else { /* free kcache set - use it */
			kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
			kcache[i].bank = bank;
			kcache[i].addr = line;
			return 0;
		}
	}
	return -ENOMEM;
}
1245
1246static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc,
1247		struct r600_bytecode_kcache *kcache,
1248		struct r600_bytecode_alu *alu)
1249{
1250	int i, r;
1251
1252	for (i = 0; i < 3; i++) {
1253		unsigned bank, line, sel = alu->src[i].sel;
1254
1255		if (sel < 512)
1256			continue;
1257
1258		bank = alu->src[i].kc_bank;
1259		line = (sel-512)>>4;
1260
1261		if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line)))
1262			return r;
1263	}
1264	return 0;
1265}
1266
1267static int r600_bytecode_assign_kcache_banks(struct r600_bytecode *bc,
1268		struct r600_bytecode_alu *alu,
1269		struct r600_bytecode_kcache * kcache)
1270{
1271	int i, j;
1272
1273	/* Alter the src operands to refer to the kcache. */
1274	for (i = 0; i < 3; ++i) {
1275		static const unsigned int base[] = {128, 160, 256, 288};
1276		unsigned int line, sel = alu->src[i].sel, found = 0;
1277
1278		if (sel < 512)
1279			continue;
1280
1281		sel -= 512;
1282		line = sel>>4;
1283
1284		for (j = 0; j < 4 && !found; ++j) {
1285			switch (kcache[j].mode) {
1286			case V_SQ_CF_KCACHE_NOP:
1287			case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX:
1288				R600_ERR("unexpected kcache line mode\n");
1289				return -ENOMEM;
1290			default:
1291				if (kcache[j].bank == alu->src[i].kc_bank &&
1292						kcache[j].addr <= line &&
1293						line < kcache[j].addr + kcache[j].mode) {
1294					alu->src[i].sel = sel - (kcache[j].addr<<4);
1295					alu->src[i].sel += base[j];
1296					found=1;
1297			    }
1298			}
1299		}
1300	}
1301	return 0;
1302}
1303
1304static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
1305{
1306	struct r600_bytecode_kcache kcache_sets[4];
1307	struct r600_bytecode_kcache *kcache = kcache_sets;
1308	int r;
1309
1310	memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache));
1311
1312	if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
1313		/* can't alloc, need to start new clause */
1314		if ((r = r600_bytecode_add_cf(bc))) {
1315			return r;
1316		}
1317		bc->cf_last->inst = type;
1318
1319		/* retry with the new clause */
1320		kcache = bc->cf_last->kcache;
1321		if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
1322			/* can't alloc again- should never happen */
1323			return r;
1324		}
1325	} else {
1326		/* update kcache sets */
1327		memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache));
1328	}
1329
1330	/* if we actually used more than 2 kcache sets - use ALU_EXTENDED on eg+ */
1331	if (kcache[2].mode != V_SQ_CF_KCACHE_NOP) {
1332		if (bc->chip_class < EVERGREEN)
1333			return -ENOMEM;
1334		bc->cf_last->eg_alu_extended = 1;
1335	}
1336
1337	return 0;
1338}
1339
1340static int insert_nop_r6xx(struct r600_bytecode *bc)
1341{
1342	struct r600_bytecode_alu alu;
1343	int r, i;
1344
1345	for (i = 0; i < 4; i++) {
1346		memset(&alu, 0, sizeof(alu));
1347		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
1348		alu.src[0].chan = i;
1349		alu.dst.chan = i;
1350		alu.last = (i == 3);
1351		r = r600_bytecode_add_alu(bc, &alu);
1352		if (r)
1353			return r;
1354	}
1355	return 0;
1356}
1357
1358/* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1359static int load_ar_r6xx(struct r600_bytecode *bc)
1360{
1361	struct r600_bytecode_alu alu;
1362	int r;
1363
1364	if (bc->ar_loaded)
1365		return 0;
1366
1367	/* hack to avoid making MOVA the last instruction in the clause */
1368	if ((bc->cf_last->ndw>>1) >= 110)
1369		bc->force_add_cf = 1;
1370
1371	memset(&alu, 0, sizeof(alu));
1372	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT;
1373	alu.src[0].sel = bc->ar_reg;
1374	alu.last = 1;
1375	alu.index_mode = INDEX_MODE_LOOP;
1376	r = r600_bytecode_add_alu(bc, &alu);
1377	if (r)
1378		return r;
1379
1380	/* no requirement to set uses waterfall on MOVA_GPR_INT */
1381	bc->ar_loaded = 1;
1382	return 0;
1383}
1384
1385/* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1386static int load_ar(struct r600_bytecode *bc)
1387{
1388	struct r600_bytecode_alu alu;
1389	int r;
1390
1391	if (bc->ar_handling)
1392		return load_ar_r6xx(bc);
1393
1394	if (bc->ar_loaded)
1395		return 0;
1396
1397	/* hack to avoid making MOVA the last instruction in the clause */
1398	if ((bc->cf_last->ndw>>1) >= 110)
1399		bc->force_add_cf = 1;
1400
1401	memset(&alu, 0, sizeof(alu));
1402	alu.inst = BC_INST(bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
1403	alu.src[0].sel = bc->ar_reg;
1404	alu.last = 1;
1405	r = r600_bytecode_add_alu(bc, &alu);
1406	if (r)
1407		return r;
1408
1409	bc->cf_last->r6xx_uses_waterfall = 1;
1410	bc->ar_loaded = 1;
1411	return 0;
1412}
1413
1414int r600_bytecode_add_alu_type(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu, int type)
1415{
1416	struct r600_bytecode_alu *nalu = r600_bytecode_alu();
1417	struct r600_bytecode_alu *lalu;
1418	int i, r;
1419
1420	if (nalu == NULL)
1421		return -ENOMEM;
1422	memcpy(nalu, alu, sizeof(struct r600_bytecode_alu));
1423
1424	if (bc->cf_last != NULL && bc->cf_last->inst != type) {
1425		/* check if we could add it anyway */
1426		if (bc->cf_last->inst == BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU) &&
1427			type == BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE)) {
1428			LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
1429				if (lalu->execute_mask) {
1430					bc->force_add_cf = 1;
1431					break;
1432				}
1433			}
1434		} else
1435			bc->force_add_cf = 1;
1436	}
1437
1438	/* cf can contains only alu or only vtx or only tex */
1439	if (bc->cf_last == NULL || bc->force_add_cf) {
1440		r = r600_bytecode_add_cf(bc);
1441		if (r) {
1442			free(nalu);
1443			return r;
1444		}
1445	}
1446	bc->cf_last->inst = type;
1447
1448	/* Check AR usage and load it if required */
1449	for (i = 0; i < 3; i++)
1450		if (nalu->src[i].rel && !bc->ar_loaded)
1451			load_ar(bc);
1452
1453	if (nalu->dst.rel && !bc->ar_loaded)
1454		load_ar(bc);
1455
1456	/* Setup the kcache for this ALU instruction. This will start a new
1457	 * ALU clause if needed. */
1458	if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) {
1459		free(nalu);
1460		return r;
1461	}
1462
1463	if (!bc->cf_last->curr_bs_head) {
1464		bc->cf_last->curr_bs_head = nalu;
1465	}
1466	/* number of gpr == the last gpr used in any alu */
1467	for (i = 0; i < 3; i++) {
1468		if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
1469			bc->ngpr = nalu->src[i].sel + 1;
1470		}
1471		if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
1472			r600_bytecode_special_constants(nalu->src[i].value,
1473				&nalu->src[i].sel, &nalu->src[i].neg);
1474	}
1475	if (nalu->dst.sel >= bc->ngpr) {
1476		bc->ngpr = nalu->dst.sel + 1;
1477	}
1478	LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
1479	/* each alu use 2 dwords */
1480	bc->cf_last->ndw += 2;
1481	bc->ndw += 2;
1482
1483	/* process cur ALU instructions for bank swizzle */
1484	if (nalu->last) {
1485		uint32_t literal[4];
1486		unsigned nliteral;
1487		struct r600_bytecode_alu *slots[5];
1488		int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
1489		r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
1490		if (r)
1491			return r;
1492
1493		if (bc->cf_last->prev_bs_head) {
1494			r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
1495			if (r)
1496				return r;
1497		}
1498
1499		if (bc->cf_last->prev_bs_head) {
1500			r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
1501			if (r)
1502				return r;
1503		}
1504
1505		r = check_and_set_bank_swizzle(bc, slots);
1506		if (r)
1507			return r;
1508
1509		for (i = 0, nliteral = 0; i < max_slots; i++) {
1510			if (slots[i]) {
1511				r = r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral);
1512				if (r)
1513					return r;
1514			}
1515		}
1516		bc->cf_last->ndw += align(nliteral, 2);
1517
1518		/* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
1519		 * worst case */
1520		if ((bc->cf_last->ndw >> 1) >= 120) {
1521			bc->force_add_cf = 1;
1522		}
1523
1524		bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
1525		bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
1526		bc->cf_last->curr_bs_head = NULL;
1527	}
1528
1529	if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst)
1530		insert_nop_r6xx(bc);
1531
1532	return 0;
1533}
1534
1535int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu)
1536{
1537	return r600_bytecode_add_alu_type(bc, alu, BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
1538}
1539
1540static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc)
1541{
1542	switch (bc->chip_class) {
1543	case R600:
1544		return 8;
1545
1546	case R700:
1547	case EVERGREEN:
1548	case CAYMAN:
1549		return 16;
1550
1551	default:
1552		R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1553		return 8;
1554	}
1555}
1556
1557static inline boolean last_inst_was_not_vtx_fetch(struct r600_bytecode *bc)
1558{
1559	switch (bc->chip_class) {
1560	case R700:
1561	case R600:
1562		return bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
1563		       bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC;
1564	case EVERGREEN:
1565		return bc->cf_last->inst != EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX;
1566	case CAYMAN:
1567		return bc->cf_last->inst != CM_V_SQ_CF_WORD1_SQ_CF_INST_TC;
1568	default:
1569		R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1570		return FALSE;
1571	}
1572}
1573
1574int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
1575{
1576	struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx();
1577	int r;
1578
1579	if (nvtx == NULL)
1580		return -ENOMEM;
1581	memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx));
1582
1583	/* cf can contains only alu or only vtx or only tex */
1584	if (bc->cf_last == NULL ||
1585	    last_inst_was_not_vtx_fetch(bc) ||
1586	    bc->force_add_cf) {
1587		r = r600_bytecode_add_cf(bc);
1588		if (r) {
1589			free(nvtx);
1590			return r;
1591		}
1592		switch (bc->chip_class) {
1593		case R600:
1594		case R700:
1595			bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
1596			break;
1597		case EVERGREEN:
1598			bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX;
1599			break;
1600		case CAYMAN:
1601			bc->cf_last->inst = CM_V_SQ_CF_WORD1_SQ_CF_INST_TC;
1602			break;
1603		default:
1604			R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1605			return -EINVAL;
1606		}
1607	}
1608	LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
1609	/* each fetch use 4 dwords */
1610	bc->cf_last->ndw += 4;
1611	bc->ndw += 4;
1612	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1613		bc->force_add_cf = 1;
1614
1615	bc->ngpr = MAX2(bc->ngpr, vtx->src_gpr + 1);
1616	bc->ngpr = MAX2(bc->ngpr, vtx->dst_gpr + 1);
1617
1618	return 0;
1619}
1620
1621int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex)
1622{
1623	struct r600_bytecode_tex *ntex = r600_bytecode_tex();
1624	int r;
1625
1626	if (ntex == NULL)
1627		return -ENOMEM;
1628	memcpy(ntex, tex, sizeof(struct r600_bytecode_tex));
1629
1630	/* we can't fetch data und use it as texture lookup address in the same TEX clause */
1631	if (bc->cf_last != NULL &&
1632		bc->cf_last->inst == BC_INST(bc, V_SQ_CF_WORD1_SQ_CF_INST_TEX)) {
1633		struct r600_bytecode_tex *ttex;
1634		LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
1635			if (ttex->dst_gpr == ntex->src_gpr) {
1636				bc->force_add_cf = 1;
1637				break;
1638			}
1639		}
1640		/* slight hack to make gradients always go into same cf */
1641		if (ntex->inst == SQ_TEX_INST_SET_GRADIENTS_H)
1642			bc->force_add_cf = 1;
1643	}
1644
1645	/* cf can contains only alu or only vtx or only tex */
1646	if (bc->cf_last == NULL ||
1647		bc->cf_last->inst != BC_INST(bc, V_SQ_CF_WORD1_SQ_CF_INST_TEX) ||
1648	        bc->force_add_cf) {
1649		r = r600_bytecode_add_cf(bc);
1650		if (r) {
1651			free(ntex);
1652			return r;
1653		}
1654		bc->cf_last->inst = BC_INST(bc, V_SQ_CF_WORD1_SQ_CF_INST_TEX);
1655	}
1656	if (ntex->src_gpr >= bc->ngpr) {
1657		bc->ngpr = ntex->src_gpr + 1;
1658	}
1659	if (ntex->dst_gpr >= bc->ngpr) {
1660		bc->ngpr = ntex->dst_gpr + 1;
1661	}
1662	LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
1663	/* each texture fetch use 4 dwords */
1664	bc->cf_last->ndw += 4;
1665	bc->ndw += 4;
1666	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1667		bc->force_add_cf = 1;
1668	return 0;
1669}
1670
1671int r600_bytecode_add_cfinst(struct r600_bytecode *bc, int inst)
1672{
1673	int r;
1674	r = r600_bytecode_add_cf(bc);
1675	if (r)
1676		return r;
1677
1678	bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
1679	bc->cf_last->inst = inst;
1680	return 0;
1681}
1682
1683int cm_bytecode_add_cf_end(struct r600_bytecode *bc)
1684{
1685	return r600_bytecode_add_cfinst(bc, CM_V_SQ_CF_WORD1_SQ_CF_INST_END);
1686}
1687
1688/* common to all 3 families */
1689static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id)
1690{
1691	bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
1692			S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
1693			S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
1694			S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
1695	if (bc->chip_class < CAYMAN)
1696		bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
1697	id++;
1698	bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
1699				S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
1700				S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
1701				S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
1702				S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
1703				S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
1704				S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
1705				S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
1706				S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
1707				S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
1708	bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
1709				S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
1710	if (bc->chip_class < CAYMAN)
1711		bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
1712	id++;
1713	bc->bytecode[id++] = 0;
1714	return 0;
1715}
1716
1717/* common to all 3 families */
1718static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id)
1719{
1720	bc->bytecode[id++] = S_SQ_TEX_WORD0_TEX_INST(tex->inst) |
1721				S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
1722				S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
1723				S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
1724	bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
1725				S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
1726				S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
1727				S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
1728				S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
1729				S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
1730				S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
1731				S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
1732				S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
1733				S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
1734				S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
1735	bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
1736				S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
1737				S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
1738				S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
1739				S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
1740				S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
1741				S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
1742				S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
1743	bc->bytecode[id++] = 0;
1744	return 0;
1745}
1746
1747/* r600 only, r700/eg bits in r700_asm.c */
1748static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id)
1749{
1750	/* don't replace gpr by pv or ps for destination register */
1751	bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
1752				S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
1753				S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
1754				S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
1755				S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
1756				S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
1757				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
1758				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
1759				S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
1760				S_SQ_ALU_WORD0_PRED_SEL(alu->pred_sel) |
1761				S_SQ_ALU_WORD0_LAST(alu->last);
1762
1763	if (alu->is_op3) {
1764		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1765					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1766					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1767					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1768					S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
1769					S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
1770					S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
1771					S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
1772					S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) |
1773					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
1774	} else {
1775		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1776					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1777					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1778					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1779					S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
1780					S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
1781					S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
1782					S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
1783					S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) |
1784					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
1785					S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->execute_mask) |
1786					S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->update_pred);
1787	}
1788	return 0;
1789}
1790
1791static void r600_bytecode_cf_vtx_build(uint32_t *bytecode, const struct r600_bytecode_cf *cf)
1792{
1793	*bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
1794	*bytecode++ = cf->inst |
1795			S_SQ_CF_WORD1_BARRIER(1) |
1796			S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
1797}
1798
1799/* common for r600/r700 - eg in eg_asm.c */
1800static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
1801{
1802	unsigned id = cf->id;
1803
1804	switch (cf->inst) {
1805	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
1806	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
1807	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
1808	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
1809		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
1810			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
1811			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
1812			S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
1813
1814		bc->bytecode[id++] = cf->inst |
1815			S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
1816			S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
1817			S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
1818					S_SQ_CF_ALU_WORD1_BARRIER(1) |
1819					S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chip_class == R600 ? cf->r6xx_uses_waterfall : 0) |
1820					S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
1821		break;
1822	case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1823	case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1824	case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1825		if (bc->chip_class == R700)
1826			r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
1827		else
1828			r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
1829		break;
1830	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1831	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1832		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1833			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1834			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1835			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
1836		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1837			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
1838			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
1839			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
1840			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
1841			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
1842			cf->output.inst |
1843			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
1844		break;
1845	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
1846	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
1847	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
1848	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
1849		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1850			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1851			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1852			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
1853		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1854			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
1855			cf->output.inst |
1856			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program) |
1857			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) |
1858			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask);
1859		break;
1860	case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1861	case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1862	case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1863	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1864	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1865	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1866	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1867	case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1868	case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1869		bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
1870		bc->bytecode[id++] = cf->inst |
1871					S_SQ_CF_WORD1_BARRIER(1) |
1872			                S_SQ_CF_WORD1_COND(cf->cond) |
1873			                S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);
1874
1875		break;
1876	default:
1877		R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1878		return -EINVAL;
1879	}
1880	return 0;
1881}
1882
1883int r600_bytecode_build(struct r600_bytecode *bc)
1884{
1885	struct r600_bytecode_cf *cf;
1886	struct r600_bytecode_alu *alu;
1887	struct r600_bytecode_vtx *vtx;
1888	struct r600_bytecode_tex *tex;
1889	uint32_t literal[4];
1890	unsigned nliteral;
1891	unsigned addr;
1892	int i, r;
1893
1894	if (bc->callstack[0].max > 0)
1895		bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
1896	if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
1897		bc->nstack = 1;
1898	}
1899
1900	/* first path compute addr of each CF block */
1901	/* addr start after all the CF instructions */
1902	addr = bc->cf_last->id + 2;
1903	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1904		if (bc->chip_class >= EVERGREEN) {
1905			switch (cf->inst) {
1906			case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1907			case EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1908				/* fetch node need to be 16 bytes aligned*/
1909				addr += 3;
1910				addr &= 0xFFFFFFFCUL;
1911				break;
1912			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
1913			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
1914			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
1915			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
1916			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1917			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1918			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
1919			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
1920			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
1921			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
1922			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
1923			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
1924			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
1925			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
1926			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
1927			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
1928			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
1929			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
1930			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
1931			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
1932			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
1933			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
1934			case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1935			case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1936			case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
1937			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1938			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1939			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1940			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1941			case EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1942			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1943			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
1944			case CF_NATIVE:
1945				break;
1946			default:
1947				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1948				return -EINVAL;
1949			}
1950		} else {
1951			switch (cf->inst) {
1952			case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1953			case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1954			case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1955				/* fetch node need to be 16 bytes aligned*/
1956				addr += 3;
1957				addr &= 0xFFFFFFFCUL;
1958				break;
1959			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
1960			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
1961			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
1962			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
1963			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1964			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1965			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
1966			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
1967			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
1968			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
1969			case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1970			case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1971			case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1972			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1973			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1974			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1975			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1976			case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1977			case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1978				break;
1979			default:
1980				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
1981				return -EINVAL;
1982			}
1983		}
1984		cf->addr = addr;
1985		addr += cf->ndw;
1986		bc->ndw = cf->addr + cf->ndw;
1987	}
1988	free(bc->bytecode);
1989	bc->bytecode = calloc(1, bc->ndw * 4);
1990	if (bc->bytecode == NULL)
1991		return -ENOMEM;
1992	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1993		addr = cf->addr;
1994		if (bc->chip_class >= EVERGREEN) {
1995			r = eg_bytecode_cf_build(bc, cf);
1996			if (r)
1997				return r;
1998
1999			switch (cf->inst) {
2000			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
2001			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
2002			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
2003			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
2004				nliteral = 0;
2005				memset(literal, 0, sizeof(literal));
2006				LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2007					r = r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
2008					if (r)
2009						return r;
2010					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
2011					r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
2012
2013					switch(bc->chip_class) {
2014					case EVERGREEN: /* eg alu is same encoding as r700 */
2015					case CAYMAN:
2016						r = r700_bytecode_alu_build(bc, alu, addr);
2017						break;
2018					default:
2019						R600_ERR("unknown chip class %d.\n", bc->chip_class);
2020						return -EINVAL;
2021					}
2022					if (r)
2023						return r;
2024					addr += 2;
2025					if (alu->last) {
2026						for (i = 0; i < align(nliteral, 2); ++i) {
2027							bc->bytecode[addr++] = literal[i];
2028						}
2029						nliteral = 0;
2030						memset(literal, 0, sizeof(literal));
2031					}
2032				}
2033				break;
2034			case EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX:
2035				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2036					r = r600_bytecode_vtx_build(bc, vtx, addr);
2037					if (r)
2038						return r;
2039					addr += 4;
2040				}
2041				break;
2042			case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
2043				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2044					assert(bc->chip_class >= EVERGREEN);
2045					r = r600_bytecode_vtx_build(bc, vtx, addr);
2046					if (r)
2047						return r;
2048					addr += 4;
2049				}
2050				LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2051					r = r600_bytecode_tex_build(bc, tex, addr);
2052					if (r)
2053						return r;
2054					addr += 4;
2055				}
2056				break;
2057			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
2058			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
2059			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
2060			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
2061			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
2062			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
2063			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
2064			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
2065			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
2066			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
2067			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
2068			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
2069			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
2070			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
2071			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
2072			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
2073			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
2074			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
2075			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
2076			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
2077			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
2078			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
2079			case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
2080			case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
2081			case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
2082			case EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
2083			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
2084			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
2085				break;
2086			case CF_NATIVE:
2087				break;
2088			default:
2089				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
2090				return -EINVAL;
2091			}
2092		} else {
2093			r = r600_bytecode_cf_build(bc, cf);
2094			if (r)
2095				return r;
2096
2097			switch (cf->inst) {
2098			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
2099			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
2100			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
2101			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
2102				nliteral = 0;
2103				memset(literal, 0, sizeof(literal));
2104				LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2105					r = r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
2106					if (r)
2107						return r;
2108					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
2109					r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
2110
2111					switch(bc->chip_class) {
2112					case R600:
2113						r = r600_bytecode_alu_build(bc, alu, addr);
2114						break;
2115					case R700:
2116						r = r700_bytecode_alu_build(bc, alu, addr);
2117						break;
2118					default:
2119						R600_ERR("unknown chip class %d.\n", bc->chip_class);
2120						return -EINVAL;
2121					}
2122					if (r)
2123						return r;
2124					addr += 2;
2125					if (alu->last) {
2126						for (i = 0; i < align(nliteral, 2); ++i) {
2127							bc->bytecode[addr++] = literal[i];
2128						}
2129						nliteral = 0;
2130						memset(literal, 0, sizeof(literal));
2131					}
2132				}
2133				break;
2134			case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
2135			case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
2136				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2137					r = r600_bytecode_vtx_build(bc, vtx, addr);
2138					if (r)
2139						return r;
2140					addr += 4;
2141				}
2142				break;
2143			case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
2144				LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2145					r = r600_bytecode_tex_build(bc, tex, addr);
2146					if (r)
2147						return r;
2148					addr += 4;
2149				}
2150				break;
2151			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
2152			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
2153			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
2154			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
2155			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
2156			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
2157			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
2158			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
2159			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
2160			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
2161			case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
2162			case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
2163			case V_SQ_CF_WORD1_SQ_CF_INST_POP:
2164			case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
2165			case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
2166				break;
2167			default:
2168				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
2169				return -EINVAL;
2170			}
2171		}
2172	}
2173	return 0;
2174}
2175
2176void r600_bytecode_clear(struct r600_bytecode *bc)
2177{
2178	struct r600_bytecode_cf *cf = NULL, *next_cf;
2179
2180	free(bc->bytecode);
2181	bc->bytecode = NULL;
2182
2183	LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
2184		struct r600_bytecode_alu *alu = NULL, *next_alu;
2185		struct r600_bytecode_tex *tex = NULL, *next_tex;
2186		struct r600_bytecode_tex *vtx = NULL, *next_vtx;
2187
2188		LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
2189			free(alu);
2190		}
2191
2192		LIST_INITHEAD(&cf->alu);
2193
2194		LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
2195			free(tex);
2196		}
2197
2198		LIST_INITHEAD(&cf->tex);
2199
2200		LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
2201			free(vtx);
2202		}
2203
2204		LIST_INITHEAD(&cf->vtx);
2205
2206		free(cf);
2207	}
2208
2209	LIST_INITHEAD(&cf->list);
2210}
2211
2212void r600_bytecode_dump(struct r600_bytecode *bc)
2213{
2214	struct r600_bytecode_cf *cf = NULL;
2215	struct r600_bytecode_alu *alu = NULL;
2216	struct r600_bytecode_vtx *vtx = NULL;
2217	struct r600_bytecode_tex *tex = NULL;
2218
2219	unsigned i, id;
2220	uint32_t literal[4];
2221	unsigned nliteral;
2222	char chip = '6';
2223
2224	switch (bc->chip_class) {
2225	case R700:
2226		chip = '7';
2227		break;
2228	case EVERGREEN:
2229		chip = 'E';
2230		break;
2231	case CAYMAN:
2232		chip = 'C';
2233		break;
2234	case R600:
2235	default:
2236		chip = '6';
2237		break;
2238	}
2239	fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
2240	fprintf(stderr, "     %c\n", chip);
2241
2242	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2243		id = cf->id;
2244
2245		if (bc->chip_class >= EVERGREEN) {
2246			switch (cf->inst) {
2247			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
2248			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
2249			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
2250			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
2251				if (cf->eg_alu_extended) {
2252					fprintf(stderr, "%04d %08X ALU_EXT0 ", id, bc->bytecode[id]);
2253					fprintf(stderr, "KCACHE_BANK2:%X ", cf->kcache[2].bank);
2254					fprintf(stderr, "KCACHE_BANK3:%X ", cf->kcache[3].bank);
2255					fprintf(stderr, "KCACHE_MODE2:%X\n", cf->kcache[2].mode);
2256					id++;
2257					fprintf(stderr, "%04d %08X ALU_EXT1 ", id, bc->bytecode[id]);
2258					fprintf(stderr, "KCACHE_MODE3:%X ", cf->kcache[3].mode);
2259					fprintf(stderr, "KCACHE_ADDR2:%X ", cf->kcache[2].addr);
2260					fprintf(stderr, "KCACHE_ADDR3:%X\n", cf->kcache[3].addr);
2261					id++;
2262				}
2263
2264				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2265				fprintf(stderr, "ADDR:%d ", cf->addr);
2266				fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
2267				fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
2268				fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
2269				id++;
2270				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2271				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_ALU_WORD1_CF_INST(cf->inst));
2272				fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
2273				fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
2274				fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
2275				fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
2276				break;
2277			case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
2278			case EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX:
2279				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2280				fprintf(stderr, "ADDR:%d\n", cf->addr);
2281				id++;
2282				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2283				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_WORD1_CF_INST(cf->inst));
2284				fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
2285				break;
2286			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
2287			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
2288				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2289				fprintf(stderr, "GPR:%X ", cf->output.gpr);
2290				fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
2291				fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
2292				fprintf(stderr, "TYPE:%X\n", cf->output.type);
2293				id++;
2294				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2295				fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
2296				fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
2297				fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
2298				fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
2299				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
2300				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst));
2301				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
2302				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
2303				break;
2304			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
2305			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
2306			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
2307			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
2308			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
2309			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
2310			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
2311			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
2312			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
2313			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
2314			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
2315			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
2316			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
2317			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
2318			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
2319			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
2320				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
2321					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2322					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
2323					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2324					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
2325				fprintf(stderr, "GPR:%X ", cf->output.gpr);
2326				fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
2327				fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
2328				fprintf(stderr, "TYPE:%X\n", cf->output.type);
2329				id++;
2330				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
2331					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2332					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
2333					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2334					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
2335				fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
2336				fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
2337				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
2338				fprintf(stderr, "INST:%d ", cf->output.inst);
2339				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
2340				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
2341				break;
2342			case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
2343			case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
2344			case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
2345			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
2346			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
2347			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
2348			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
2349			case EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
2350			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
2351			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
2352				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2353				fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
2354				id++;
2355				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2356				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_WORD1_CF_INST(cf->inst));
2357				fprintf(stderr, "COND:%X ", cf->cond);
2358				fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
2359				break;
2360			case CF_NATIVE:
2361				fprintf(stderr, "%04d %08X CF NATIVE\n", id, bc->bytecode[id]);
2362				fprintf(stderr, "%04d %08X CF NATIVE\n", id + 1, bc->bytecode[id + 1]);
2363				break;
2364			default:
2365				R600_ERR("Unknown instruction %0x\n", cf->inst);
2366			}
2367		} else {
2368			switch (cf->inst) {
2369			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
2370			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
2371			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
2372			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
2373				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2374				fprintf(stderr, "ADDR:%d ", cf->addr);
2375				fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
2376				fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
2377				fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
2378				id++;
2379				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2380				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_ALU_WORD1_CF_INST(cf->inst));
2381				fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
2382				fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
2383				fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
2384				fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
2385				break;
2386			case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
2387			case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
2388			case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
2389				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2390				fprintf(stderr, "ADDR:%d\n", cf->addr);
2391				id++;
2392				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2393				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_WORD1_CF_INST(cf->inst));
2394				fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
2395				break;
2396			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
2397			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
2398				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2399				fprintf(stderr, "GPR:%X ", cf->output.gpr);
2400				fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
2401				fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
2402				fprintf(stderr, "TYPE:%X\n", cf->output.type);
2403				id++;
2404				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
2405				fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
2406				fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
2407				fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
2408				fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
2409				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
2410				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst));
2411				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
2412				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
2413				break;
2414			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
2415			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
2416			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
2417			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
2418				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
2419					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2420					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
2421				fprintf(stderr, "GPR:%X ", cf->output.gpr);
2422				fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
2423				fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
2424				fprintf(stderr, "TYPE:%X\n", cf->output.type);
2425				id++;
2426				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
2427					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
2428					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
2429				fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
2430				fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
2431				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
2432				fprintf(stderr, "INST:%d ", cf->output.inst);
2433				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
2434				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
2435				break;
2436			case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
2437			case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
2438			case V_SQ_CF_WORD1_SQ_CF_INST_POP:
2439			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
2440			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
2441			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
2442			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
2443			case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
2444			case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
2445				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2446				fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
2447				id++;
2448				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2449				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_WORD1_CF_INST(cf->inst));
2450				fprintf(stderr, "COND:%X ", cf->cond);
2451				fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
2452				break;
2453			default:
2454				R600_ERR("Unknown instruction %0x\n", cf->inst);
2455			}
2456		}
2457
2458		id = cf->addr;
2459		nliteral = 0;
2460		LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2461			r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
2462
2463			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2464			fprintf(stderr, "SRC0(SEL:%d ", alu->src[0].sel);
2465			fprintf(stderr, "REL:%d ", alu->src[0].rel);
2466			fprintf(stderr, "CHAN:%d ", alu->src[0].chan);
2467			fprintf(stderr, "NEG:%d) ", alu->src[0].neg);
2468			fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
2469			fprintf(stderr, "REL:%d ", alu->src[1].rel);
2470			fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
2471			fprintf(stderr, "NEG:%d ", alu->src[1].neg);
2472			fprintf(stderr, "IM:%d) ", alu->index_mode);
2473			fprintf(stderr, "PRED_SEL:%d ", alu->pred_sel);
2474			fprintf(stderr, "LAST:%d)\n", alu->last);
2475			id++;
2476			fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
2477			fprintf(stderr, "INST:0x%x ", alu->inst);
2478			fprintf(stderr, "DST(SEL:%d ", alu->dst.sel);
2479			fprintf(stderr, "CHAN:%d ", alu->dst.chan);
2480			fprintf(stderr, "REL:%d ", alu->dst.rel);
2481			fprintf(stderr, "CLAMP:%d) ", alu->dst.clamp);
2482			fprintf(stderr, "BANK_SWIZZLE:%d ", alu->bank_swizzle);
2483			if (alu->is_op3) {
2484				fprintf(stderr, "SRC2(SEL:%d ", alu->src[2].sel);
2485				fprintf(stderr, "REL:%d ", alu->src[2].rel);
2486				fprintf(stderr, "CHAN:%d ", alu->src[2].chan);
2487				fprintf(stderr, "NEG:%d)\n", alu->src[2].neg);
2488			} else {
2489				fprintf(stderr, "SRC0_ABS:%d ", alu->src[0].abs);
2490				fprintf(stderr, "SRC1_ABS:%d ", alu->src[1].abs);
2491				fprintf(stderr, "WRITE_MASK:%d ", alu->dst.write);
2492				fprintf(stderr, "OMOD:%d ", alu->omod);
2493				fprintf(stderr, "EXECUTE_MASK:%d ", alu->execute_mask);
2494				fprintf(stderr, "UPDATE_PRED:%d\n", alu->update_pred);
2495			}
2496
2497			id++;
2498			if (alu->last) {
2499				for (i = 0; i < nliteral; i++, id++) {
2500					float *f = (float*)(bc->bytecode + id);
2501					fprintf(stderr, "%04d %08X\t%f (%d)\n", id, bc->bytecode[id], *f,
2502							*(bc->bytecode + id));
2503				}
2504				id += nliteral & 1;
2505				nliteral = 0;
2506			}
2507		}
2508
2509		LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2510			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2511			fprintf(stderr, "INST:0x%x ", tex->inst);
2512			fprintf(stderr, "RESOURCE_ID:%d ", tex->resource_id);
2513			fprintf(stderr, "SRC(GPR:%d ", tex->src_gpr);
2514			fprintf(stderr, "REL:%d)\n", tex->src_rel);
2515			id++;
2516			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2517			fprintf(stderr, "DST(GPR:%d ", tex->dst_gpr);
2518			fprintf(stderr, "REL:%d ", tex->dst_rel);
2519			fprintf(stderr, "SEL_X:%d ", tex->dst_sel_x);
2520			fprintf(stderr, "SEL_Y:%d ", tex->dst_sel_y);
2521			fprintf(stderr, "SEL_Z:%d ", tex->dst_sel_z);
2522			fprintf(stderr, "SEL_W:%d) ", tex->dst_sel_w);
2523			fprintf(stderr, "LOD_BIAS:%d ", tex->lod_bias);
2524			fprintf(stderr, "COORD_TYPE_X:%d ", tex->coord_type_x);
2525			fprintf(stderr, "COORD_TYPE_Y:%d ", tex->coord_type_y);
2526			fprintf(stderr, "COORD_TYPE_Z:%d ", tex->coord_type_z);
2527			fprintf(stderr, "COORD_TYPE_W:%d\n", tex->coord_type_w);
2528			id++;
2529			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2530			fprintf(stderr, "OFFSET_X:%d ", tex->offset_x);
2531			fprintf(stderr, "OFFSET_Y:%d ", tex->offset_y);
2532			fprintf(stderr, "OFFSET_Z:%d ", tex->offset_z);
2533			fprintf(stderr, "SAMPLER_ID:%d ", tex->sampler_id);
2534			fprintf(stderr, "SRC(SEL_X:%d ", tex->src_sel_x);
2535			fprintf(stderr, "SEL_Y:%d ", tex->src_sel_y);
2536			fprintf(stderr, "SEL_Z:%d ", tex->src_sel_z);
2537			fprintf(stderr, "SEL_W:%d)\n", tex->src_sel_w);
2538			id++;
2539			fprintf(stderr, "%04d %08X   \n", id, bc->bytecode[id]);
2540			id++;
2541		}
2542
2543		LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2544			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2545			fprintf(stderr, "INST:%d ", vtx->inst);
2546			fprintf(stderr, "FETCH_TYPE:%d ", vtx->fetch_type);
2547			fprintf(stderr, "BUFFER_ID:%d\n", vtx->buffer_id);
2548			id++;
2549			/* This assumes that no semantic fetches exist */
2550			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2551			fprintf(stderr, "SRC(GPR:%d ", vtx->src_gpr);
2552			fprintf(stderr, "SEL_X:%d) ", vtx->src_sel_x);
2553			if (bc->chip_class < CAYMAN)
2554				fprintf(stderr, "MEGA_FETCH_COUNT:%d ", vtx->mega_fetch_count);
2555			else
2556				fprintf(stderr, "SEL_Y:%d) ", 0);
2557			fprintf(stderr, "DST(GPR:%d ", vtx->dst_gpr);
2558			fprintf(stderr, "SEL_X:%d ", vtx->dst_sel_x);
2559			fprintf(stderr, "SEL_Y:%d ", vtx->dst_sel_y);
2560			fprintf(stderr, "SEL_Z:%d ", vtx->dst_sel_z);
2561			fprintf(stderr, "SEL_W:%d) ", vtx->dst_sel_w);
2562			fprintf(stderr, "USE_CONST_FIELDS:%d ", vtx->use_const_fields);
2563			fprintf(stderr, "FORMAT(DATA:%d ", vtx->data_format);
2564			fprintf(stderr, "NUM:%d ", vtx->num_format_all);
2565			fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
2566			fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);
2567			id++;
2568			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
2569			fprintf(stderr, "ENDIAN:%d ", vtx->endian);
2570			fprintf(stderr, "OFFSET:%d\n", vtx->offset);
2571			/* XXX */
2572			id++;
2573			fprintf(stderr, "%04d %08X   \n", id, bc->bytecode[id]);
2574			id++;
2575		}
2576	}
2577
2578	fprintf(stderr, "--------------------------------------\n");
2579}
2580
2581static void r600_vertex_data_type(enum pipe_format pformat,
2582				  unsigned *format,
2583				  unsigned *num_format, unsigned *format_comp, unsigned *endian)
2584{
2585	const struct util_format_description *desc;
2586	unsigned i;
2587
2588	*format = 0;
2589	*num_format = 0;
2590	*format_comp = 0;
2591	*endian = ENDIAN_NONE;
2592
2593	desc = util_format_description(pformat);
2594	if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
2595		goto out_unknown;
2596	}
2597
2598	/* Find the first non-VOID channel. */
2599	for (i = 0; i < 4; i++) {
2600		if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
2601			break;
2602		}
2603	}
2604
2605	*endian = r600_endian_swap(desc->channel[i].size);
2606
2607	switch (desc->channel[i].type) {
2608	/* Half-floats, floats, ints */
2609	case UTIL_FORMAT_TYPE_FLOAT:
2610		switch (desc->channel[i].size) {
2611		case 16:
2612			switch (desc->nr_channels) {
2613			case 1:
2614				*format = FMT_16_FLOAT;
2615				break;
2616			case 2:
2617				*format = FMT_16_16_FLOAT;
2618				break;
2619			case 3:
2620			case 4:
2621				*format = FMT_16_16_16_16_FLOAT;
2622				break;
2623			}
2624			break;
2625		case 32:
2626			switch (desc->nr_channels) {
2627			case 1:
2628				*format = FMT_32_FLOAT;
2629				break;
2630			case 2:
2631				*format = FMT_32_32_FLOAT;
2632				break;
2633			case 3:
2634				*format = FMT_32_32_32_FLOAT;
2635				break;
2636			case 4:
2637				*format = FMT_32_32_32_32_FLOAT;
2638				break;
2639			}
2640			break;
2641		default:
2642			goto out_unknown;
2643		}
2644		break;
2645		/* Unsigned ints */
2646	case UTIL_FORMAT_TYPE_UNSIGNED:
2647		/* Signed ints */
2648	case UTIL_FORMAT_TYPE_SIGNED:
2649		switch (desc->channel[i].size) {
2650		case 8:
2651			switch (desc->nr_channels) {
2652			case 1:
2653				*format = FMT_8;
2654				break;
2655			case 2:
2656				*format = FMT_8_8;
2657				break;
2658			case 3:
2659			case 4:
2660				*format = FMT_8_8_8_8;
2661				break;
2662			}
2663			break;
2664		case 10:
2665			if (desc->nr_channels != 4)
2666				goto out_unknown;
2667
2668			*format = FMT_2_10_10_10;
2669			break;
2670		case 16:
2671			switch (desc->nr_channels) {
2672			case 1:
2673				*format = FMT_16;
2674				break;
2675			case 2:
2676				*format = FMT_16_16;
2677				break;
2678			case 3:
2679			case 4:
2680				*format = FMT_16_16_16_16;
2681				break;
2682			}
2683			break;
2684		case 32:
2685			switch (desc->nr_channels) {
2686			case 1:
2687				*format = FMT_32;
2688				break;
2689			case 2:
2690				*format = FMT_32_32;
2691				break;
2692			case 3:
2693				*format = FMT_32_32_32;
2694				break;
2695			case 4:
2696				*format = FMT_32_32_32_32;
2697				break;
2698			}
2699			break;
2700		default:
2701			goto out_unknown;
2702		}
2703		break;
2704	default:
2705		goto out_unknown;
2706	}
2707
2708	if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2709		*format_comp = 1;
2710	}
2711
2712	*num_format = 0;
2713	if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
2714	    desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2715		if (!desc->channel[i].normalized) {
2716			if (desc->channel[i].pure_integer)
2717				*num_format = 1;
2718			else
2719				*num_format = 2;
2720		}
2721	}
2722	return;
2723out_unknown:
2724	R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
2725}
2726
2727int r600_vertex_elements_build_fetch_shader(struct r600_context *rctx, struct r600_vertex_element *ve)
2728{
2729	static int dump_shaders = -1;
2730
2731	struct r600_bytecode bc;
2732	struct r600_bytecode_vtx vtx;
2733	struct pipe_vertex_element *elements = ve->elements;
2734	const struct util_format_description *desc;
2735	unsigned fetch_resource_start = rctx->chip_class >= EVERGREEN ? 0 : 160;
2736	unsigned format, num_format, format_comp, endian;
2737	uint32_t *bytecode;
2738	int i, r;
2739
2740	memset(&bc, 0, sizeof(bc));
2741	r600_bytecode_init(&bc, rctx->chip_class, rctx->family);
2742
2743	for (i = 0; i < ve->count; i++) {
2744		if (elements[i].instance_divisor > 1) {
2745			struct r600_bytecode_alu alu;
2746
2747			memset(&alu, 0, sizeof(alu));
2748			alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2749			alu.src[0].sel = 0;
2750			alu.src[0].chan = 3;
2751
2752			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2753			alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2754
2755			alu.dst.sel = i + 1;
2756			alu.dst.chan = 3;
2757			alu.dst.write = 1;
2758			alu.last = 1;
2759
2760			if ((r = r600_bytecode_add_alu(&bc, &alu))) {
2761				r600_bytecode_clear(&bc);
2762				return r;
2763			}
2764		}
2765	}
2766
2767	for (i = 0; i < ve->count; i++) {
2768		r600_vertex_data_type(ve->elements[i].src_format,
2769				      &format, &num_format, &format_comp, &endian);
2770
2771		desc = util_format_description(ve->elements[i].src_format);
2772		if (desc == NULL) {
2773			r600_bytecode_clear(&bc);
2774			R600_ERR("unknown format %d\n", ve->elements[i].src_format);
2775			return -EINVAL;
2776		}
2777
2778		if (elements[i].src_offset > 65535) {
2779			r600_bytecode_clear(&bc);
2780			R600_ERR("too big src_offset: %u\n", elements[i].src_offset);
2781			return -EINVAL;
2782		}
2783
2784		memset(&vtx, 0, sizeof(vtx));
2785		vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start;
2786		vtx.fetch_type = elements[i].instance_divisor ? 1 : 0;
2787		vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
2788		vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
2789		vtx.mega_fetch_count = 0x1F;
2790		vtx.dst_gpr = i + 1;
2791		vtx.dst_sel_x = desc->swizzle[0];
2792		vtx.dst_sel_y = desc->swizzle[1];
2793		vtx.dst_sel_z = desc->swizzle[2];
2794		vtx.dst_sel_w = desc->swizzle[3];
2795		vtx.data_format = format;
2796		vtx.num_format_all = num_format;
2797		vtx.format_comp_all = format_comp;
2798		vtx.srf_mode_all = 1;
2799		vtx.offset = elements[i].src_offset;
2800		vtx.endian = endian;
2801
2802		if ((r = r600_bytecode_add_vtx(&bc, &vtx))) {
2803			r600_bytecode_clear(&bc);
2804			return r;
2805		}
2806	}
2807
2808	r600_bytecode_add_cfinst(&bc, BC_INST(&bc, V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
2809
2810	if ((r = r600_bytecode_build(&bc))) {
2811		r600_bytecode_clear(&bc);
2812		return r;
2813	}
2814
2815	if (dump_shaders == -1)
2816		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
2817
2818	if (dump_shaders) {
2819		fprintf(stderr, "--------------------------------------------------------------\n");
2820		r600_bytecode_dump(&bc);
2821		fprintf(stderr, "______________________________________________________________\n");
2822	}
2823
2824	ve->fs_size = bc.ndw*4;
2825
2826	ve->fetch_shader = (struct r600_resource*)
2827			pipe_buffer_create(rctx->context.screen,
2828					   PIPE_BIND_CUSTOM,
2829					   PIPE_USAGE_IMMUTABLE, ve->fs_size);
2830	if (ve->fetch_shader == NULL) {
2831		r600_bytecode_clear(&bc);
2832		return -ENOMEM;
2833	}
2834
2835	bytecode = rctx->ws->buffer_map(ve->fetch_shader->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
2836	if (bytecode == NULL) {
2837		r600_bytecode_clear(&bc);
2838		pipe_resource_reference((struct pipe_resource**)&ve->fetch_shader, NULL);
2839		return -ENOMEM;
2840	}
2841
2842	if (R600_BIG_ENDIAN) {
2843		for (i = 0; i < ve->fs_size / 4; ++i) {
2844			bytecode[i] = bswap_32(bc.bytecode[i]);
2845		}
2846	} else {
2847		memcpy(bytecode, bc.bytecode, ve->fs_size);
2848	}
2849
2850	rctx->ws->buffer_unmap(ve->fetch_shader->cs_buf);
2851	r600_bytecode_clear(&bc);
2852
2853	if (rctx->chip_class >= EVERGREEN)
2854		evergreen_fetch_shader(&rctx->context, ve);
2855	else
2856		r600_fetch_shader(&rctx->context, ve);
2857
2858	return 0;
2859}
2860