r600_shader.c revision a6fe055fa77e42e35f25272bdd5ca7213b436a1a
1/* 2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 */ 23#include "r600_sq.h" 24#include "r600_llvm.h" 25#include "r600_formats.h" 26#include "r600_opcodes.h" 27#include "r600_shader.h" 28#include "r600d.h" 29 30#include "sb/sb_public.h" 31 32#include "pipe/p_shader_tokens.h" 33#include "tgsi/tgsi_info.h" 34#include "tgsi/tgsi_parse.h" 35#include "tgsi/tgsi_scan.h" 36#include "tgsi/tgsi_dump.h" 37#include "util/u_memory.h" 38#include <stdio.h> 39#include <errno.h> 40#include <byteswap.h> 41 42/* CAYMAN notes 43Why CAYMAN got loops for lots of instructions is explained here. 44 45-These 8xx t-slot only ops are implemented in all vector slots. 46MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT 47These 8xx t-slot only opcodes become vector ops, with all four 48slots expecting the arguments on sources a and b. 
Result is 49broadcast to all channels. 50MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT 51These 8xx t-slot only opcodes become vector ops in the z, y, and 52x slots. 53EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64 54RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64 55SQRT_IEEE/_64 56SIN/COS 57The w slot may have an independent co-issued operation, or if the 58result is required to be in the w slot, the opcode above may be 59issued in the w slot as well. 60The compiler must issue the source argument to slots z, y, and x 61*/ 62 63static int r600_shader_from_tgsi(struct r600_screen *rscreen, 64 struct r600_pipe_shader *pipeshader, 65 struct r600_shader_key key); 66 67static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr, 68 int size, unsigned comp_mask) { 69 70 if (!size) 71 return; 72 73 if (ps->num_arrays == ps->max_arrays) { 74 ps->max_arrays += 64; 75 ps->arrays = realloc(ps->arrays, ps->max_arrays * 76 sizeof(struct r600_shader_array)); 77 } 78 79 int n = ps->num_arrays; 80 ++ps->num_arrays; 81 82 ps->arrays[n].comp_mask = comp_mask; 83 ps->arrays[n].gpr_start = start_gpr; 84 ps->arrays[n].gpr_count = size; 85} 86 87static unsigned tgsi_get_processor_type(const struct tgsi_token *tokens) 88{ 89 struct tgsi_parse_context parse; 90 91 if (tgsi_parse_init( &parse, tokens ) != TGSI_PARSE_OK) { 92 debug_printf("tgsi_parse_init() failed in %s:%i!\n", __func__, __LINE__); 93 return ~0; 94 } 95 return parse.FullHeader.Processor.Processor; 96} 97 98static bool r600_can_dump_shader(struct r600_screen *rscreen, unsigned processor_type) 99{ 100 switch (processor_type) { 101 case TGSI_PROCESSOR_VERTEX: 102 return (rscreen->debug_flags & DBG_VS) != 0; 103 case TGSI_PROCESSOR_GEOMETRY: 104 return (rscreen->debug_flags & DBG_GS) != 0; 105 case TGSI_PROCESSOR_FRAGMENT: 106 return (rscreen->debug_flags & DBG_PS) != 0; 107 case TGSI_PROCESSOR_COMPUTE: 108 return (rscreen->debug_flags & DBG_CS) != 0; 109 default: 110 return false; 111 } 112} 113 
/* Debug helper: dump the stream-output (transform feedback) mapping of
 * every declared output to stderr. */
static void r600_dump_streamout(struct pipe_stream_output_info *so)
{
	unsigned i;

	fprintf(stderr, "STREAMOUT\n");
	for (i = 0; i < so->num_outputs; i++) {
		/* mask of written channels, shifted to the start component */
		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
				so->output[i].start_component;
		fprintf(stderr, " %i: MEM_STREAM0_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
			i, so->output[i].output_buffer,
			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
			so->output[i].register_index,
			mask & 1 ? "x" : "",
			mask & 2 ? "y" : "",
			mask & 4 ? "z" : "",
			mask & 8 ? "w" : "",
			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
	}
}

/* Translate a TGSI shader into r600 bytecode, optionally run the sb
 * optimizing backend / disassembler, upload the bytecode into a buffer
 * object and build the chip-specific state.
 * Returns 0 on success, a negative errno-style value on failure. */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    struct r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r, i;
	uint32_t *ptr;
	bool dump = r600_can_dump_shader(rctx->screen, tgsi_get_processor_type(sel->tokens));
	unsigned use_sb = rctx->screen->debug_flags & DBG_SB;
	unsigned sb_disasm = use_sb || (rctx->screen->debug_flags & DBG_SB_DISASM);

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx->screen, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		return r;
	}
	r = r600_bytecode_build(&shader->shader.bc);
	if (r) {
		R600_ERR("building bytecode failed !\n");
		return r;
	}

	/* The sb backend is run either to optimize (use_sb) or only to
	 * disassemble (sb_disasm) the freshly built bytecode. */
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			return r;
		}
	}

	/* Store the shader in a buffer. */
	if (shader->bo == NULL) {
		shader->bo = (struct r600_resource*)
			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
		if (shader->bo == NULL) {
			return -ENOMEM;
		}
		/* NOTE(review): the mmap result is used without a NULL check —
		 * TODO confirm r600_buffer_mmap_sync_with_rings cannot fail here. */
		ptr = r600_buffer_mmap_sync_with_rings(rctx, shader->bo, PIPE_TRANSFER_WRITE);
		if (R600_BIG_ENDIAN) {
			/* hardware expects little-endian dwords; swap on BE hosts */
			for (i = 0; i < shader->shader.bc.ndw; ++i) {
				ptr[i] = bswap_32(shader->shader.bc.bytecode[i]);
			}
		} else {
			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
		}
		rctx->ws->buffer_unmap(shader->bo->cs_buf);
	}

	/* Build state. */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_VERTEX:
		if (rctx->chip_class >= EVERGREEN) {
			evergreen_update_vs_state(ctx, shader);
		} else {
			r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		return -EINVAL;
	}
	return 0;
}

/* Release the GPU buffer, bytecode storage and command buffer owned by
 * a pipe shader. */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}

/*
 * tgsi -> r600 shader
 */
struct r600_shader_tgsi_instruction;

/* Decoded TGSI source operand (register select, swizzle, modifiers,
 * constant-buffer bank and inline literal values). */
struct r600_shader_src {
	unsigned				sel;
	unsigned				swizzle[4];
	unsigned				neg;
	unsigned				abs;
	unsigned				rel;
	unsigned				kc_bank;
	uint32_t				value[4];
};

/* Per-shader translation state shared by all tgsi_* handlers below. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;
	unsigned				file_offset[TGSI_FILE_COUNT];	/* GPR base per TGSI register file */
	unsigned				temp_reg;
	struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	boolean					use_llvm;
	/* needed for evergreen interpolation */
	boolean					input_centroid;
	boolean					input_linear;
	boolean					input_perspective;
	int					num_interp_gpr;
	int					face_gpr;
	int					colors_used;
	boolean					clip_vertex_write;
	unsigned				cv_output;
	int					fragcoord_input;
	int					native_integers;
};

/* One entry of the per-chip TGSI opcode dispatch tables. */
struct r600_shader_tgsi_instruction {
	unsigned	tgsi_opcode;
	unsigned	is_op3;
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};

static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);

/*
 * bytestream -> r600 shader
 *
 * These functions are used to transform the output of the LLVM backend into
 * struct r600_bytecode.
 */

static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
				unsigned char * bytes, unsigned num_bytes);

#ifdef HAVE_OPENCL
/* Compile an LLVM module into r600 bytecode for a compute shader:
 * run the LLVM backend, deserialize its byte stream, build the bytecode
 * and optionally run the sb backend / disassembler. Always returns 1. */
int r600_compute_shader_create(struct pipe_context * ctx,
	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
{
	struct r600_context *r600_ctx = (struct r600_context *)ctx;
	unsigned char * bytes;
	unsigned byte_count;
	struct r600_shader_ctx shader_ctx;
	boolean use_kill = false;
	bool dump = (r600_ctx->screen->debug_flags & DBG_CS) != 0;
	unsigned use_sb = r600_ctx->screen->debug_flags & DBG_SB_CS;
	unsigned sb_disasm = use_sb ||
		(r600_ctx->screen->debug_flags & DBG_SB_DISASM);

	shader_ctx.bc = bytecode;
	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family,
			   r600_ctx->screen->msaa_texture_support);
	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
	shader_ctx.bc->isa = r600_ctx->isa;
	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family,
				shader_ctx.bc, &use_kill, dump);
	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
	if (shader_ctx.bc->chip_class == CAYMAN) {
		cm_bytecode_add_cf_end(shader_ctx.bc);
	}
	r600_bytecode_build(shader_ctx.bc);

	if (dump && !sb_disasm) {
		r600_bytecode_disasm(shader_ctx.bc);
	} else if ((dump && sb_disasm) || use_sb) {
		if (r600_sb_bytecode_process(r600_ctx, shader_ctx.bc, NULL, dump, use_sb))
			R600_ERR("r600_sb_bytecode_process failed!\n");
	}

	free(bytes);
	return 1;
}

#endif /* HAVE_OPENCL */

/* Read a little-endian 32-bit value from the stream, advancing *bytes_read. */
static uint32_t i32_from_byte_stream(unsigned char * bytes,
		unsigned * bytes_read)
{
	unsigned i;
	uint32_t out = 0;
	for (i = 0; i < 4; i++) {
		out |= bytes[(*bytes_read)++] << (8 * i);
	}
	return out;
}

/* Decode one serialized ALU source operand into alu->src[src_idx].
 * Layout: sel (16-bit LE), chan, neg, abs, rel, kc_bank, value (32-bit LE).
 * Returns the updated read offset. */
static unsigned r600_src_from_byte_stream(unsigned char * bytes,
		unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
{
	unsigned i;
	unsigned sel0, sel1;
	sel0 = bytes[bytes_read++];
	sel1 = bytes[bytes_read++];
	alu->src[src_idx].sel = sel0 | (sel1 << 8);
	alu->src[src_idx].chan = bytes[bytes_read++];
	alu->src[src_idx].neg = bytes[bytes_read++];
	alu->src[src_idx].abs = bytes[bytes_read++];
	alu->src[src_idx].rel = bytes[bytes_read++];
	alu->src[src_idx].kc_bank = bytes[bytes_read++];
	for (i = 0; i < 4; i++) {
		alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
	}
	return bytes_read;
}

/* Decode one ALU instruction from the stream: per-source overrides, then the
 * two hardware encoding words (chip-specific decoder). MOVA ops only record
 * the AR register/channel; KILL ops mark the shader and force a new CF.
 * Returns the updated read offset. */
static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
	unsigned char * bytes, unsigned bytes_read)
{
	unsigned src_idx, src_num;
	struct r600_bytecode_alu alu;
	unsigned src_use_sel[3];
	const struct alu_op_info *alu_op;
	unsigned src_sel[3] = {};
	uint32_t word0, word1;

	src_num = bytes[bytes_read++];

	memset(&alu, 0, sizeof(alu));
	for(src_idx = 0; src_idx < src_num; src_idx++) {
		unsigned i;
		src_use_sel[src_idx] = bytes[bytes_read++];
		for (i = 0; i < 4; i++) {
			src_sel[src_idx] |= bytes[bytes_read++] << (i * 8);
		}
		for (i = 0; i < 4; i++) {
			alu.src[src_idx].value |= bytes[bytes_read++] << (i * 8);
		}
	}

	word0 = i32_from_byte_stream(bytes, &bytes_read);
	word1 = i32_from_byte_stream(bytes, &bytes_read);

	switch(ctx->bc->chip_class) {
	default:
	case R600:
		r600_bytecode_alu_read(ctx->bc, &alu, word0, word1);
		break;
	case R700:
	case EVERGREEN:
	case CAYMAN:
		r700_bytecode_alu_read(ctx->bc, &alu, word0, word1);
		break;
	}

	/* Apply per-source select overrides: low 2 bits are the channel,
	 * the rest is the sel; sels >= 512 address a kcache constant with
	 * the bank in the upper bits. */
	for(src_idx = 0; src_idx < src_num; src_idx++) {
		if (src_use_sel[src_idx]) {
			unsigned sel = src_sel[src_idx];

			alu.src[src_idx].chan = sel & 3;
			sel >>= 2;

			if (sel>=512) { /* constant */
				sel -= 512;
				alu.src[src_idx].kc_bank = sel >> 12;
				alu.src[src_idx].sel = (sel & 4095) + 512;
			}
			else {
				alu.src[src_idx].sel = sel;
			}
		}
	}

	alu_op = r600_isa_alu(alu.op);

#if HAVE_LLVM < 0x0302
	if ((alu_op->flags & AF_PRED) && alu_op->src_count == 2) {
		alu.update_pred = 1;
		alu.dst.write = 0;
		alu.src[1].sel = V_SQ_ALU_SRC_0;
		alu.src[1].chan = 0;
		alu.last = 1;
	}
#endif

	if (alu_op->flags & AF_MOVA) {
		/* MOVA only loads the address register; no ALU op is emitted. */
		ctx->bc->ar_reg = alu.src[0].sel;
		ctx->bc->ar_chan = alu.src[0].chan;
		ctx->bc->ar_loaded = 0;
		return bytes_read;
	}

	r600_bytecode_add_alu_type(ctx->bc, &alu, ctx->bc->cf_last->op);

	/* XXX: Handle other KILL instructions */
	if (alu_op->flags & AF_KILL) {
		ctx->shader->uses_kill = 1;
		/* XXX: This should be enforced in the LLVM backend. */
		ctx->bc->force_add_cf = 1;
	}
	return bytes_read;
}

/* Open an IF block: emit a JUMP CF instruction and push the
 * flow-control and callstack state. */
static void llvm_if(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	fc_pushlevel(ctx, FC_IF);
	callstack_push(ctx, FC_PUSH_VPM);
}

/* Emit a predicated BREAK as "if (pred) { break; }" using the
 * chip-specific BRK handler. */
static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx)
{
	unsigned opcode = TGSI_OPCODE_BRK;
	if (ctx->bc->chip_class == CAYMAN)
		ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
	else if (ctx->bc->chip_class >= EVERGREEN)
		ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
	else
		ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
	llvm_if(ctx);
	tgsi_loop_brk_cont(ctx);
	tgsi_endif(ctx);
}

/* Decode one flow-control record: a source operand (consumed but only
 * used implicitly via predicate state) followed by a 1-byte opcode.
 * Returns the updated read offset. */
static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
	unsigned char * bytes, unsigned bytes_read)
{
	struct r600_bytecode_alu alu;
	unsigned inst;
	memset(&alu, 0, sizeof(alu));
	bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
	inst = bytes[bytes_read++];
	switch (inst) {
	case 0: /* IF_PREDICATED */
		llvm_if(ctx);
		break;
	case 1: /* ELSE */
		tgsi_else(ctx);
		break;
	case 2: /* ENDIF */
		tgsi_endif(ctx);
		break;
	case 3: /* BGNLOOP */
		tgsi_bgnloop(ctx);
		break;
	case 4: /* ENDLOOP */
		tgsi_endloop(ctx);
		break;
	case 5: /* PREDICATED_BREAK */
		r600_break_from_byte_stream(ctx);
		break;
	case 6: /* CONTINUE */
		{
			unsigned opcode = TGSI_OPCODE_CONT;
			if (ctx->bc->chip_class == CAYMAN) {
				ctx->inst_info =
					&cm_shader_tgsi_instruction[opcode];
			} else if (ctx->bc->chip_class >= EVERGREEN) {
				ctx->inst_info =
					&eg_shader_tgsi_instruction[opcode];
			} else {
				ctx->inst_info =
					&r600_shader_tgsi_instruction[opcode];
			}
			tgsi_loop_brk_cont(ctx);
		}
		break;
	}

	return bytes_read;
}

/* Decode a texture fetch from its three 32-bit encoding words and add it
 * to the bytecode. Returns the updated read offset.
 * NOTE(review): unlike the vtx decoder below, `tex` is not zero-initialized;
 * any r600_bytecode_tex field not assigned here holds stack garbage —
 * TODO confirm every consumed field is covered. */
static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
	unsigned char * bytes, unsigned bytes_read)
{
	struct r600_bytecode_tex tex;

	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
	uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);

	tex.op = r600_isa_fetch_by_opcode(ctx->bc->isa, G_SQ_TEX_WORD0_TEX_INST(word0));
	tex.resource_id = G_SQ_TEX_WORD0_RESOURCE_ID(word0);
	tex.src_gpr = G_SQ_TEX_WORD0_SRC_GPR(word0);
	tex.src_rel = G_SQ_TEX_WORD0_SRC_REL(word0);
	tex.dst_gpr = G_SQ_TEX_WORD1_DST_GPR(word1);
	tex.dst_rel = G_SQ_TEX_WORD1_DST_REL(word1);
	tex.dst_sel_x = G_SQ_TEX_WORD1_DST_SEL_X(word1);
	tex.dst_sel_y = G_SQ_TEX_WORD1_DST_SEL_Y(word1);
	tex.dst_sel_z = G_SQ_TEX_WORD1_DST_SEL_Z(word1);
	tex.dst_sel_w = G_SQ_TEX_WORD1_DST_SEL_W(word1);
	tex.lod_bias = G_SQ_TEX_WORD1_LOD_BIAS(word1);
	tex.coord_type_x = G_SQ_TEX_WORD1_COORD_TYPE_X(word1);
	tex.coord_type_y = G_SQ_TEX_WORD1_COORD_TYPE_Y(word1);
	tex.coord_type_z = G_SQ_TEX_WORD1_COORD_TYPE_Z(word1);
	tex.coord_type_w = G_SQ_TEX_WORD1_COORD_TYPE_W(word1);
	tex.offset_x = G_SQ_TEX_WORD2_OFFSET_X(word2);
	tex.offset_y = G_SQ_TEX_WORD2_OFFSET_Y(word2);
	tex.offset_z = G_SQ_TEX_WORD2_OFFSET_Z(word2);
	tex.sampler_id = G_SQ_TEX_WORD2_SAMPLER_ID(word2);
	tex.src_sel_x = G_SQ_TEX_WORD2_SRC_SEL_X(word2);
	tex.src_sel_y = G_SQ_TEX_WORD2_SRC_SEL_Y(word2);
	tex.src_sel_z = G_SQ_TEX_WORD2_SRC_SEL_Z(word2);
	tex.src_sel_w = G_SQ_TEX_WORD2_SRC_SEL_W(word2);
	tex.offset_x <<= 1;
	tex.offset_y <<= 1;
	tex.offset_z <<= 1;

	tex.inst_mod = 0;

	r600_bytecode_add_tex(ctx->bc, &tex);

	return bytes_read;
}

/* Decode a vertex fetch from its three 32-bit encoding words and add it
 * to the bytecode. Returns the updated read offset. */
static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
	unsigned char * bytes, unsigned bytes_read)
{
	struct r600_bytecode_vtx vtx;

	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
	uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);

	memset(&vtx, 0, sizeof(vtx));

	/* WORD0 */
	vtx.op = r600_isa_fetch_by_opcode(ctx->bc->isa,
			G_SQ_VTX_WORD0_VTX_INST(word0));
	vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
	vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
	vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
	vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
	vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);

	/* WORD1 */
	vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
	vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
	vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
	vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
	vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
	vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
	vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
	vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
	vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
	vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);

	/* WORD 2*/
	vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
	vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);

	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
		fprintf(stderr, "Error adding vtx\n");
	}

	/* Use the Texture Cache for compute shaders*/
	if (ctx->bc->chip_class >= EVERGREEN &&
		ctx->bc->type == TGSI_PROCESSOR_COMPUTE) {
		ctx->bc->cf_last->op = CF_OP_TEX;
	}
	return bytes_read;
}

/* Decode an export from its two 32-bit encoding words (chip-specific
 * layout) and add it as an output. Returns the updated read offset. */
static int r600_export_from_byte_stream(struct r600_shader_ctx *ctx,
	unsigned char * bytes, unsigned bytes_read)
{
	uint32_t word0 = 0, word1 = 0;
	struct r600_bytecode_output output;
	memset(&output, 0, sizeof(struct r600_bytecode_output));
	word0 = i32_from_byte_stream(bytes, &bytes_read);
	word1 = i32_from_byte_stream(bytes, &bytes_read);
	if (ctx->bc->chip_class >= EVERGREEN)
		eg_bytecode_export_read(ctx->bc, &output, word0,word1);
	else
		r600_bytecode_export_read(ctx->bc, &output, word0,word1);
	r600_bytecode_add_output(ctx->bc, &output);
	return bytes_read;
}

/* Dispatcher for the LLVM backend's byte stream: each record starts with
 * a 1-byte type tag (0=ALU, 1=TEX, 2=flow control, 3=native CF words,
 * 4=VTX, 5=export, 6=CF ALU with kcache setup). */
static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
				unsigned char * bytes,	unsigned num_bytes)
{
	unsigned bytes_read = 0;
	unsigned i, byte;
	while (bytes_read < num_bytes) {
		char inst_type = bytes[bytes_read++];
		switch (inst_type) {
		case 0:
			bytes_read = r600_alu_from_byte_stream(ctx, bytes,
								bytes_read);
			break;
		case 1:
			bytes_read = r600_tex_from_byte_stream(ctx, bytes,
								bytes_read);
			break;
		case 2:
			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
								bytes_read);
			break;
		case 3:
			/* two raw 32-bit CF words, copied verbatim */
			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
			for (i = 0; i < 2; i++) {
				for (byte = 0 ; byte < 4; byte++) {
					ctx->bc->cf_last->isa[i] |=
					(bytes[bytes_read++] << (byte * 8));
				}
			}
			break;

		case 4:
			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
								bytes_read);
			break;
		case 5:
			bytes_read = r600_export_from_byte_stream(ctx, bytes,
								bytes_read);
			break;
		case 6: {
			int32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
			int32_t word1 = i32_from_byte_stream(bytes, &bytes_read);

			r600_bytecode_add_cf(ctx->bc);
			ctx->bc->cf_last->op = r600_isa_cf_by_opcode(ctx->bc->isa, G_SQ_CF_ALU_WORD1_CF_INST(word1), 1);
			ctx->bc->cf_last->kcache[0].bank = G_SQ_CF_ALU_WORD0_KCACHE_BANK0(word0);
			ctx->bc->cf_last->kcache[0].addr = G_SQ_CF_ALU_WORD1_KCACHE_ADDR0(word1);
			ctx->bc->cf_last->kcache[0].mode = G_SQ_CF_ALU_WORD0_KCACHE_MODE0(word0);
			ctx->bc->cf_last->kcache[1].bank = G_SQ_CF_ALU_WORD0_KCACHE_BANK1(word0);
			ctx->bc->cf_last->kcache[1].addr = G_SQ_CF_ALU_WORD1_KCACHE_ADDR1(word1);
			ctx->bc->cf_last->kcache[1].mode = G_SQ_CF_ALU_WORD1_KCACHE_MODE1(word1);
			break;
		}
		default:
			/* XXX: Error here */
			break;
		}
	}
}

/* End bytestream -> r600 shader functions*/

/* Reject TGSI constructs this backend cannot translate (multiple dsts,
 * predicates, non-constant dimensioned registers). Returns 0 or -EINVAL. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	int j;

	if (i->Instruction.NumDstRegs > 1) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
	if (i->Instruction.Predicate) {
		R600_ERR("predicate unsupported\n");
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			if (i->Src[j].Register.File != TGSI_FILE_CONSTANT) {
				R600_ERR("unsupported src %d (dimension %d)\n", j,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}

/* Pick the barycentric (i,j) pair slot for one fragment input, based on
 * its interpolation mode, centroid flag, and which modes the whole
 * shader uses (perspective pairs are allocated before linear ones). */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
		int input)
{
	int ij_index = 0;

	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
		if (ctx->shader->input[input].centroid)
			ij_index++;
	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
		/* if we have perspective add one */
		if (ctx->input_perspective) {
			ij_index++;
			/* if we have perspective centroid */
			if (ctx->input_centroid)
				ij_index++;
		}
		if (ctx->shader->input[input].centroid)
			ij_index++;
	}

	ctx->shader->input[input].ij_index = ij_index;
}

/* Emit the 8 INTERP_ZW/INTERP_XY ALU ops that interpolate one input from
 * its (i,j) pair GPR and per-parameter LDS slot; only the middle four ops
 * (i = 2..5) actually write the destination channels. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Load a flat-shaded (non-interpolated) input: one INTERP_LOAD_P0 per
 * channel from the parameter's LDS slot. */
static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_INTERP_LOAD_P0;

		alu.dst.sel = ctx->shader->input[input].gpr;
		alu.dst.write = 1;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[0].chan = i;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/*
 * Special export handling in
shaders
 *
 * shader export ARRAY_BASE for EXPORT_POS:
 * 60 is position
 * 61 is misc vector
 * 62, 63 are clip distance vectors
 *
 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
 * exclusive from render target index)
 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
 *
 *
 * shader export ARRAY_BASE for EXPORT_PIXEL:
 * 0-7 CB targets
 * 61 computed Z vector
 *
 * The use of the values exported in the computed Z vector are controlled
 * by DB_SHADER_CONTROL:
 * Z_EXPORT_ENABLE - Z as a float in RED
 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
 * DB_SOURCE_FORMAT - export control restrictions
 *
 */


/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
static int r600_spi_sid(struct r600_shader_io * io)
{
	int index, name = io->name;

	/* These params are handled differently, they don't need
	 * semantic indices, so we'll use 0 for them.
	 */
	if (name == TGSI_SEMANTIC_POSITION ||
	    name == TGSI_SEMANTIC_PSIZE ||
	    name == TGSI_SEMANTIC_FACE)
		index = 0;
	else {
		if (name == TGSI_SEMANTIC_GENERIC) {
			/* For generic params simply use sid from tgsi */
			index = io->sid;
		} else {
			/* For non-generic params - pack name and sid into 8 bits */
			index = 0x80 | (name<<3) | (io->sid);
		}

		/* Make sure that all really used indices have nonzero value, so
		 * we can just compare it to 0 later instead of comparing the name
		 * with different values to detect special cases. */
		index++;
	}

	return index;
};

/* turn input into interpolate on EG */
static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
{
	int r = 0;

	if (ctx->shader->input[index].spi_sid) {
		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
		if (ctx->shader->input[index].interpolate > 0) {
			evergreen_interp_assign_ij_index(ctx, index);
			/* with the LLVM backend the interpolation code is
			 * emitted by LLVM, not here */
			if (!ctx->use_llvm)
				r = evergreen_interp_alu(ctx, index);
		} else {
			if (!ctx->use_llvm)
				r = evergreen_interp_flat(ctx, index);
		}
	}
	return r;
}

/* Two-sided lighting: per channel, overwrite the front color with the
 * back color when the face register selects the back face (CNDGT). */
static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
{
	struct r600_bytecode_alu alu;
	int i, r;
	int gpr_front = ctx->shader->input[front].gpr;
	int gpr_back = ctx->shader->input[back].gpr;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		alu.dst.write = 1;
		alu.dst.sel = gpr_front;
		alu.src[0].sel = ctx->face_gpr;
		alu.src[1].sel = gpr_front;
		alu.src[2].sel = gpr_back;

		alu.dst.chan = i;
		alu.src[1].chan = i;
		alu.src[2].chan = i;
		alu.last = (i==3);

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
	}

	return 0;
}

/* Process one TGSI declaration token: record inputs/outputs (assigning
 * GPRs and SPI semantic ids), register indirect temporary arrays, and
 * handle the supported system values. Returns 0 or a negative errno. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		i = ctx->shader->ninput;
		ctx->shader->ninput += count;
		ctx->shader->input[i].name = d->Semantic.Name;
		ctx->shader->input[i].sid = d->Semantic.Index;
		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
		ctx->shader->input[i].centroid = d->Interp.Centroid;
		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
			switch (ctx->shader->input[i].name) {
			case TGSI_SEMANTIC_FACE:
				ctx->face_gpr = ctx->shader->input[i].gpr;
				break;
			case TGSI_SEMANTIC_COLOR:
				ctx->colors_used++;
				break;
			case TGSI_SEMANTIC_POSITION:
				ctx->fragcoord_input = i;
				break;
			}
			if (ctx->bc->chip_class >= EVERGREEN) {
				if ((r = evergreen_interp_input(ctx, i)))
					return r;
			}
		}
		/* declarations covering a range: clone the first entry with
		 * consecutive GPRs */
		for (j = 1; j < count; ++j) {
			ctx->shader->input[i + j] = ctx->shader->input[i];
			ctx->shader->input[i + j].gpr += j;
		}
		break;
	case TGSI_FILE_OUTPUT:
		i = ctx->shader->noutput++;
		ctx->shader->output[i].name = d->Semantic.Name;
		ctx->shader->output[i].sid = d->Semantic.Index;
		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
			ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_CLIPDIST:
				/* 4 mask bits per clip-distance vector */
				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
				break;
			case TGSI_SEMANTIC_PSIZE:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_point_size = 1;
				break;
			case TGSI_SEMANTIC_CLIPVERTEX:
				ctx->clip_vertex_write = TRUE;
				ctx->cv_output = i;
				break;
			}
		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_COLOR:
				ctx->shader->nr_ps_max_color_exports++;
				break;
			}
		}
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			if (!ctx->native_integers) {
				/* no integer support: convert the instance id
				 * (GPR0.w) to float in place */
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		/* fallthrough: any other system value is unsupported */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}

/* Allocate the next free driver-internal temporary GPR. */
static int r600_get_temp(struct r600_shader_ctx *ctx)
{
	return ctx->temp_reg + ctx->max_driver_temp_used++;
}

/*
 * for evergreen we need to scan the shader to find the number of GPRs we need to
 * reserve for interpolation.
1050 * 1051 * we need to know if we are going to emit 1052 * any centroid inputs 1053 * if perspective and linear are required 1054*/ 1055static int evergreen_gpr_count(struct r600_shader_ctx *ctx) 1056{ 1057 int i; 1058 int num_baryc; 1059 1060 ctx->input_linear = FALSE; 1061 ctx->input_perspective = FALSE; 1062 ctx->input_centroid = FALSE; 1063 ctx->num_interp_gpr = 1; 1064 1065 /* any centroid inputs */ 1066 for (i = 0; i < ctx->info.num_inputs; i++) { 1067 /* skip position/face */ 1068 if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION || 1069 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE) 1070 continue; 1071 if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR) 1072 ctx->input_linear = TRUE; 1073 if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE) 1074 ctx->input_perspective = TRUE; 1075 if (ctx->info.input_centroid[i]) 1076 ctx->input_centroid = TRUE; 1077 } 1078 1079 num_baryc = 0; 1080 /* ignoring sample for now */ 1081 if (ctx->input_perspective) 1082 num_baryc++; 1083 if (ctx->input_linear) 1084 num_baryc++; 1085 if (ctx->input_centroid) 1086 num_baryc *= 2; 1087 1088 ctx->num_interp_gpr += (num_baryc + 1) >> 1; 1089 1090 /* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */ 1091 return ctx->num_interp_gpr; 1092} 1093 1094static void tgsi_src(struct r600_shader_ctx *ctx, 1095 const struct tgsi_full_src_register *tgsi_src, 1096 struct r600_shader_src *r600_src) 1097{ 1098 memset(r600_src, 0, sizeof(*r600_src)); 1099 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX; 1100 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY; 1101 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ; 1102 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW; 1103 r600_src->neg = tgsi_src->Register.Negate; 1104 r600_src->abs = tgsi_src->Register.Absolute; 1105 1106 if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) { 1107 int index; 1108 if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) && 1109 
(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) && 1110 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) { 1111 1112 index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX; 1113 r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg); 1114 if (r600_src->sel != V_SQ_ALU_SRC_LITERAL) 1115 return; 1116 } 1117 index = tgsi_src->Register.Index; 1118 r600_src->sel = V_SQ_ALU_SRC_LITERAL; 1119 memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value)); 1120 } else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) { 1121 if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) { 1122 r600_src->swizzle[0] = 3; 1123 r600_src->swizzle[1] = 3; 1124 r600_src->swizzle[2] = 3; 1125 r600_src->swizzle[3] = 3; 1126 r600_src->sel = 0; 1127 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) { 1128 r600_src->swizzle[0] = 0; 1129 r600_src->swizzle[1] = 0; 1130 r600_src->swizzle[2] = 0; 1131 r600_src->swizzle[3] = 0; 1132 r600_src->sel = 0; 1133 } 1134 } else { 1135 if (tgsi_src->Register.Indirect) 1136 r600_src->rel = V_SQ_REL_RELATIVE; 1137 r600_src->sel = tgsi_src->Register.Index; 1138 r600_src->sel += ctx->file_offset[tgsi_src->Register.File]; 1139 } 1140 if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) { 1141 if (tgsi_src->Register.Dimension) { 1142 r600_src->kc_bank = tgsi_src->Dimension.Index; 1143 } 1144 } 1145} 1146 1147static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int cb_idx, unsigned int offset, unsigned int dst_reg) 1148{ 1149 struct r600_bytecode_vtx vtx; 1150 unsigned int ar_reg; 1151 int r; 1152 1153 if (offset) { 1154 struct r600_bytecode_alu alu; 1155 1156 memset(&alu, 0, sizeof(alu)); 1157 1158 alu.op = ALU_OP2_ADD_INT; 1159 alu.src[0].sel = ctx->bc->ar_reg; 1160 1161 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 1162 alu.src[1].value = offset; 1163 1164 alu.dst.sel = 
dst_reg; 1165 alu.dst.write = 1; 1166 alu.last = 1; 1167 1168 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 1169 return r; 1170 1171 ar_reg = dst_reg; 1172 } else { 1173 ar_reg = ctx->bc->ar_reg; 1174 } 1175 1176 memset(&vtx, 0, sizeof(vtx)); 1177 vtx.buffer_id = cb_idx; 1178 vtx.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */ 1179 vtx.src_gpr = ar_reg; 1180 vtx.mega_fetch_count = 16; 1181 vtx.dst_gpr = dst_reg; 1182 vtx.dst_sel_x = 0; /* SEL_X */ 1183 vtx.dst_sel_y = 1; /* SEL_Y */ 1184 vtx.dst_sel_z = 2; /* SEL_Z */ 1185 vtx.dst_sel_w = 3; /* SEL_W */ 1186 vtx.data_format = FMT_32_32_32_32_FLOAT; 1187 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */ 1188 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */ 1189 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */ 1190 vtx.endian = r600_endian_swap(32); 1191 1192 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx))) 1193 return r; 1194 1195 return 0; 1196} 1197 1198static int tgsi_split_constant(struct r600_shader_ctx *ctx) 1199{ 1200 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 1201 struct r600_bytecode_alu alu; 1202 int i, j, k, nconst, r; 1203 1204 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) { 1205 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) { 1206 nconst++; 1207 } 1208 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]); 1209 } 1210 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) { 1211 if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) { 1212 continue; 1213 } 1214 1215 if (ctx->src[i].rel) { 1216 int treg = r600_get_temp(ctx); 1217 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].sel - 512, treg))) 1218 return r; 1219 1220 ctx->src[i].kc_bank = 0; 1221 ctx->src[i].sel = treg; 1222 ctx->src[i].rel = 0; 1223 j--; 1224 } else if (j > 0) { 1225 int treg = r600_get_temp(ctx); 1226 for (k = 0; k < 4; k++) { 1227 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1228 alu.op = ALU_OP1_MOV; 1229 alu.src[0].sel = ctx->src[i].sel; 1230 
alu.src[0].chan = k; 1231 alu.src[0].rel = ctx->src[i].rel; 1232 alu.dst.sel = treg; 1233 alu.dst.chan = k; 1234 alu.dst.write = 1; 1235 if (k == 3) 1236 alu.last = 1; 1237 r = r600_bytecode_add_alu(ctx->bc, &alu); 1238 if (r) 1239 return r; 1240 } 1241 ctx->src[i].sel = treg; 1242 ctx->src[i].rel =0; 1243 j--; 1244 } 1245 } 1246 return 0; 1247} 1248 1249/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */ 1250static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx) 1251{ 1252 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 1253 struct r600_bytecode_alu alu; 1254 int i, j, k, nliteral, r; 1255 1256 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) { 1257 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 1258 nliteral++; 1259 } 1260 } 1261 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) { 1262 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 1263 int treg = r600_get_temp(ctx); 1264 for (k = 0; k < 4; k++) { 1265 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1266 alu.op = ALU_OP1_MOV; 1267 alu.src[0].sel = ctx->src[i].sel; 1268 alu.src[0].chan = k; 1269 alu.src[0].value = ctx->src[i].value[k]; 1270 alu.dst.sel = treg; 1271 alu.dst.chan = k; 1272 alu.dst.write = 1; 1273 if (k == 3) 1274 alu.last = 1; 1275 r = r600_bytecode_add_alu(ctx->bc, &alu); 1276 if (r) 1277 return r; 1278 } 1279 ctx->src[i].sel = treg; 1280 j--; 1281 } 1282 } 1283 return 0; 1284} 1285 1286static int process_twoside_color_inputs(struct r600_shader_ctx *ctx) 1287{ 1288 int i, r, count = ctx->shader->ninput; 1289 1290 for (i = 0; i < count; i++) { 1291 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) { 1292 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input); 1293 if (r) 1294 return r; 1295 } 1296 } 1297 return 0; 1298} 1299 1300 1301static int r600_shader_from_tgsi(struct r600_screen *rscreen, 1302 struct r600_pipe_shader *pipeshader, 1303 
struct r600_shader_key key) 1304{ 1305 struct r600_shader *shader = &pipeshader->shader; 1306 struct tgsi_token *tokens = pipeshader->selector->tokens; 1307 struct pipe_stream_output_info so = pipeshader->selector->so; 1308 struct tgsi_full_immediate *immediate; 1309 struct tgsi_full_property *property; 1310 struct r600_shader_ctx ctx; 1311 struct r600_bytecode_output output[32]; 1312 unsigned output_done, noutput; 1313 unsigned opcode; 1314 int i, j, k, r = 0; 1315 int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0; 1316 /* Declarations used by llvm code */ 1317 bool use_llvm = false; 1318 unsigned char * inst_bytes = NULL; 1319 unsigned inst_byte_count = 0; 1320 bool indirect_gprs; 1321 1322#ifdef R600_USE_LLVM 1323 use_llvm = !(rscreen->debug_flags & DBG_NO_LLVM); 1324#endif 1325 ctx.bc = &shader->bc; 1326 ctx.shader = shader; 1327 ctx.native_integers = true; 1328 1329 r600_bytecode_init(ctx.bc, rscreen->chip_class, rscreen->family, 1330 rscreen->msaa_texture_support); 1331 ctx.tokens = tokens; 1332 tgsi_scan_shader(tokens, &ctx.info); 1333 shader->indirect_files = ctx.info.indirect_files; 1334 indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT); 1335 tgsi_parse_init(&ctx.parse, tokens); 1336 ctx.type = ctx.parse.FullHeader.Processor.Processor; 1337 shader->processor_type = ctx.type; 1338 ctx.bc->type = shader->processor_type; 1339 1340 ctx.face_gpr = -1; 1341 ctx.fragcoord_input = -1; 1342 ctx.colors_used = 0; 1343 ctx.clip_vertex_write = 0; 1344 1345 shader->nr_ps_color_exports = 0; 1346 shader->nr_ps_max_color_exports = 0; 1347 1348 shader->two_side = key.color_two_side; 1349 1350 /* register allocations */ 1351 /* Values [0,127] correspond to GPR[0..127]. 1352 * Values [128,159] correspond to constant buffer bank 0 1353 * Values [160,191] correspond to constant buffer bank 1 1354 * Values [256,511] correspond to cfile constants c[0..255]. 
(Gone on EG) 1355 * Values [256,287] correspond to constant buffer bank 2 (EG) 1356 * Values [288,319] correspond to constant buffer bank 3 (EG) 1357 * Other special values are shown in the list below. 1358 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+) 1359 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+) 1360 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+) 1361 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+) 1362 * 248 SQ_ALU_SRC_0: special constant 0.0. 1363 * 249 SQ_ALU_SRC_1: special constant 1.0 float. 1364 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer. 1365 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer. 1366 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float. 1367 * 253 SQ_ALU_SRC_LITERAL: literal constant. 1368 * 254 SQ_ALU_SRC_PV: previous vector result. 1369 * 255 SQ_ALU_SRC_PS: previous scalar result. 1370 */ 1371 for (i = 0; i < TGSI_FILE_COUNT; i++) { 1372 ctx.file_offset[i] = 0; 1373 } 1374 if (ctx.type == TGSI_PROCESSOR_VERTEX) { 1375 ctx.file_offset[TGSI_FILE_INPUT] = 1; 1376 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS); 1377 } 1378 if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) { 1379 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx); 1380 } 1381 1382#ifdef R600_USE_LLVM 1383 if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) { 1384 fprintf(stderr, "Warning: R600 LLVM backend does not support " 1385 "indirect adressing. 
Falling back to TGSI " 1386 "backend.\n"); 1387 use_llvm = 0; 1388 } 1389#endif 1390 ctx.use_llvm = use_llvm; 1391 1392 if (use_llvm) { 1393 ctx.file_offset[TGSI_FILE_OUTPUT] = 1394 ctx.file_offset[TGSI_FILE_INPUT]; 1395 } else { 1396 ctx.file_offset[TGSI_FILE_OUTPUT] = 1397 ctx.file_offset[TGSI_FILE_INPUT] + 1398 ctx.info.file_max[TGSI_FILE_INPUT] + 1; 1399 } 1400 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] + 1401 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1; 1402 1403 /* Outside the GPR range. This will be translated to one of the 1404 * kcache banks later. */ 1405 ctx.file_offset[TGSI_FILE_CONSTANT] = 512; 1406 1407 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL; 1408 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] + 1409 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1; 1410 ctx.temp_reg = ctx.bc->ar_reg + 1; 1411 1412 if (indirect_gprs) { 1413 shader->max_arrays = 0; 1414 shader->num_arrays = 0; 1415 1416 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) { 1417 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT], 1418 ctx.file_offset[TGSI_FILE_OUTPUT] - 1419 ctx.file_offset[TGSI_FILE_INPUT], 1420 0x0F); 1421 } 1422 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) { 1423 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT], 1424 ctx.file_offset[TGSI_FILE_TEMPORARY] - 1425 ctx.file_offset[TGSI_FILE_OUTPUT], 1426 0x0F); 1427 } 1428 } 1429 1430 ctx.nliterals = 0; 1431 ctx.literals = NULL; 1432 shader->fs_write_all = FALSE; 1433 while (!tgsi_parse_end_of_tokens(&ctx.parse)) { 1434 tgsi_parse_token(&ctx.parse); 1435 switch (ctx.parse.FullToken.Token.Type) { 1436 case TGSI_TOKEN_TYPE_IMMEDIATE: 1437 immediate = &ctx.parse.FullToken.FullImmediate; 1438 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16); 1439 if(ctx.literals == NULL) { 1440 r = -ENOMEM; 1441 goto out_err; 1442 } 1443 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint; 1444 ctx.literals[ctx.nliterals * 4 + 1] = 
immediate->u[1].Uint; 1445 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint; 1446 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint; 1447 ctx.nliterals++; 1448 break; 1449 case TGSI_TOKEN_TYPE_DECLARATION: 1450 r = tgsi_declaration(&ctx); 1451 if (r) 1452 goto out_err; 1453 break; 1454 case TGSI_TOKEN_TYPE_INSTRUCTION: 1455 break; 1456 case TGSI_TOKEN_TYPE_PROPERTY: 1457 property = &ctx.parse.FullToken.FullProperty; 1458 switch (property->Property.PropertyName) { 1459 case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS: 1460 if (property->u[0].Data == 1) 1461 shader->fs_write_all = TRUE; 1462 break; 1463 case TGSI_PROPERTY_VS_PROHIBIT_UCPS: 1464 /* we don't need this one */ 1465 break; 1466 } 1467 break; 1468 default: 1469 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type); 1470 r = -EINVAL; 1471 goto out_err; 1472 } 1473 } 1474 1475 /* Process two side if needed */ 1476 if (shader->two_side && ctx.colors_used) { 1477 int i, count = ctx.shader->ninput; 1478 unsigned next_lds_loc = ctx.shader->nlds; 1479 1480 /* additional inputs will be allocated right after the existing inputs, 1481 * we won't need them after the color selection, so we don't need to 1482 * reserve these gprs for the rest of the shader code and to adjust 1483 * output offsets etc. 
*/ 1484 int gpr = ctx.file_offset[TGSI_FILE_INPUT] + 1485 ctx.info.file_max[TGSI_FILE_INPUT] + 1; 1486 1487 if (ctx.face_gpr == -1) { 1488 i = ctx.shader->ninput++; 1489 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE; 1490 ctx.shader->input[i].spi_sid = 0; 1491 ctx.shader->input[i].gpr = gpr++; 1492 ctx.face_gpr = ctx.shader->input[i].gpr; 1493 } 1494 1495 for (i = 0; i < count; i++) { 1496 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) { 1497 int ni = ctx.shader->ninput++; 1498 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io)); 1499 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR; 1500 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]); 1501 ctx.shader->input[ni].gpr = gpr++; 1502 // TGSI to LLVM needs to know the lds position of inputs. 1503 // Non LLVM path computes it later (in process_twoside_color) 1504 ctx.shader->input[ni].lds_pos = next_lds_loc++; 1505 ctx.shader->input[i].back_color_input = ni; 1506 if (ctx.bc->chip_class >= EVERGREEN) { 1507 if ((r = evergreen_interp_input(&ctx, ni))) 1508 return r; 1509 } 1510 } 1511 } 1512 } 1513 1514/* LLVM backend setup */ 1515#ifdef R600_USE_LLVM 1516 if (use_llvm) { 1517 struct radeon_llvm_context radeon_llvm_ctx; 1518 LLVMModuleRef mod; 1519 bool dump = r600_can_dump_shader(rscreen, ctx.type); 1520 boolean use_kill = false; 1521 1522 memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx)); 1523 radeon_llvm_ctx.type = ctx.type; 1524 radeon_llvm_ctx.two_side = shader->two_side; 1525 radeon_llvm_ctx.face_gpr = ctx.face_gpr; 1526 radeon_llvm_ctx.r600_inputs = ctx.shader->input; 1527 radeon_llvm_ctx.r600_outputs = ctx.shader->output; 1528 radeon_llvm_ctx.color_buffer_count = MAX2(key.nr_cbufs , 1); 1529 radeon_llvm_ctx.chip_class = ctx.bc->chip_class; 1530 radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->chip_class >= EVERGREEN); 1531 radeon_llvm_ctx.stream_outputs = &so; 1532 radeon_llvm_ctx.clip_vertex = ctx.cv_output; 1533 
radeon_llvm_ctx.alpha_to_one = key.alpha_to_one; 1534 mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens); 1535 1536 if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count, 1537 rscreen->family, ctx.bc, &use_kill, dump)) { 1538 FREE(inst_bytes); 1539 radeon_llvm_dispose(&radeon_llvm_ctx); 1540 use_llvm = 0; 1541 fprintf(stderr, "R600 LLVM backend failed to compile " 1542 "shader. Falling back to TGSI\n"); 1543 } else { 1544 ctx.file_offset[TGSI_FILE_OUTPUT] = 1545 ctx.file_offset[TGSI_FILE_INPUT]; 1546 } 1547 if (use_kill) 1548 ctx.shader->uses_kill = use_kill; 1549 radeon_llvm_dispose(&radeon_llvm_ctx); 1550 } 1551#endif 1552/* End of LLVM backend setup */ 1553 1554 if (shader->fs_write_all && rscreen->chip_class >= EVERGREEN) 1555 shader->nr_ps_max_color_exports = 8; 1556 1557 if (!use_llvm) { 1558 if (ctx.fragcoord_input >= 0) { 1559 if (ctx.bc->chip_class == CAYMAN) { 1560 for (j = 0 ; j < 4; j++) { 1561 struct r600_bytecode_alu alu; 1562 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1563 alu.op = ALU_OP1_RECIP_IEEE; 1564 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 1565 alu.src[0].chan = 3; 1566 1567 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 1568 alu.dst.chan = j; 1569 alu.dst.write = (j == 3); 1570 alu.last = 1; 1571 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 1572 return r; 1573 } 1574 } else { 1575 struct r600_bytecode_alu alu; 1576 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1577 alu.op = ALU_OP1_RECIP_IEEE; 1578 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 1579 alu.src[0].chan = 3; 1580 1581 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 1582 alu.dst.chan = 3; 1583 alu.dst.write = 1; 1584 alu.last = 1; 1585 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 1586 return r; 1587 } 1588 } 1589 1590 if (shader->two_side && ctx.colors_used) { 1591 if ((r = process_twoside_color_inputs(&ctx))) 1592 return r; 1593 } 1594 1595 tgsi_parse_init(&ctx.parse, tokens); 1596 while (!tgsi_parse_end_of_tokens(&ctx.parse)) 
{ 1597 tgsi_parse_token(&ctx.parse); 1598 switch (ctx.parse.FullToken.Token.Type) { 1599 case TGSI_TOKEN_TYPE_INSTRUCTION: 1600 r = tgsi_is_supported(&ctx); 1601 if (r) 1602 goto out_err; 1603 ctx.max_driver_temp_used = 0; 1604 /* reserve first tmp for everyone */ 1605 r600_get_temp(&ctx); 1606 1607 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode; 1608 if ((r = tgsi_split_constant(&ctx))) 1609 goto out_err; 1610 if ((r = tgsi_split_literal_constant(&ctx))) 1611 goto out_err; 1612 if (ctx.bc->chip_class == CAYMAN) 1613 ctx.inst_info = &cm_shader_tgsi_instruction[opcode]; 1614 else if (ctx.bc->chip_class >= EVERGREEN) 1615 ctx.inst_info = &eg_shader_tgsi_instruction[opcode]; 1616 else 1617 ctx.inst_info = &r600_shader_tgsi_instruction[opcode]; 1618 r = ctx.inst_info->process(&ctx); 1619 if (r) 1620 goto out_err; 1621 break; 1622 default: 1623 break; 1624 } 1625 } 1626 } 1627 1628 /* Reset the temporary register counter. */ 1629 ctx.max_driver_temp_used = 0; 1630 1631 /* Get instructions if we are using the LLVM backend. 
*/ 1632 if (use_llvm) { 1633 r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count); 1634 FREE(inst_bytes); 1635 } 1636 1637 noutput = shader->noutput; 1638 1639 if (ctx.clip_vertex_write) { 1640 unsigned clipdist_temp[2]; 1641 1642 clipdist_temp[0] = r600_get_temp(&ctx); 1643 clipdist_temp[1] = r600_get_temp(&ctx); 1644 1645 /* need to convert a clipvertex write into clipdistance writes and not export 1646 the clip vertex anymore */ 1647 1648 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io)); 1649 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 1650 shader->output[noutput].gpr = clipdist_temp[0]; 1651 noutput++; 1652 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 1653 shader->output[noutput].gpr = clipdist_temp[1]; 1654 noutput++; 1655 1656 /* reset spi_sid for clipvertex output to avoid confusing spi */ 1657 shader->output[ctx.cv_output].spi_sid = 0; 1658 1659 shader->clip_dist_write = 0xFF; 1660 1661 for (i = 0; i < 8; i++) { 1662 int oreg = i >> 2; 1663 int ochan = i & 3; 1664 1665 for (j = 0; j < 4; j++) { 1666 struct r600_bytecode_alu alu; 1667 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1668 alu.op = ALU_OP2_DOT4; 1669 alu.src[0].sel = shader->output[ctx.cv_output].gpr; 1670 alu.src[0].chan = j; 1671 1672 alu.src[1].sel = 512 + i; 1673 alu.src[1].kc_bank = R600_UCP_CONST_BUFFER; 1674 alu.src[1].chan = j; 1675 1676 alu.dst.sel = clipdist_temp[oreg]; 1677 alu.dst.chan = j; 1678 alu.dst.write = (j == ochan); 1679 if (j == 3) 1680 alu.last = 1; 1681 if (!use_llvm) 1682 r = r600_bytecode_add_alu(ctx.bc, &alu); 1683 if (r) 1684 return r; 1685 } 1686 } 1687 } 1688 1689 /* Add stream outputs. */ 1690 if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs && !use_llvm) { 1691 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS]; 1692 1693 /* Sanity checking. 
*/ 1694 if (so.num_outputs > PIPE_MAX_SHADER_OUTPUTS) { 1695 R600_ERR("Too many stream outputs: %d\n", so.num_outputs); 1696 r = -EINVAL; 1697 goto out_err; 1698 } 1699 for (i = 0; i < so.num_outputs; i++) { 1700 if (so.output[i].output_buffer >= 4) { 1701 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n", 1702 so.output[i].output_buffer); 1703 r = -EINVAL; 1704 goto out_err; 1705 } 1706 } 1707 1708 /* Initialize locations where the outputs are stored. */ 1709 for (i = 0; i < so.num_outputs; i++) { 1710 so_gpr[i] = shader->output[so.output[i].register_index].gpr; 1711 1712 /* Lower outputs with dst_offset < start_component. 1713 * 1714 * We can only output 4D vectors with a write mask, e.g. we can 1715 * only output the W component at offset 3, etc. If we want 1716 * to store Y, Z, or W at buffer offset 0, we need to use MOV 1717 * to move it to X and output X. */ 1718 if (so.output[i].dst_offset < so.output[i].start_component) { 1719 unsigned tmp = r600_get_temp(&ctx); 1720 1721 for (j = 0; j < so.output[i].num_components; j++) { 1722 struct r600_bytecode_alu alu; 1723 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1724 alu.op = ALU_OP1_MOV; 1725 alu.src[0].sel = so_gpr[i]; 1726 alu.src[0].chan = so.output[i].start_component + j; 1727 1728 alu.dst.sel = tmp; 1729 alu.dst.chan = j; 1730 alu.dst.write = 1; 1731 if (j == so.output[i].num_components - 1) 1732 alu.last = 1; 1733 r = r600_bytecode_add_alu(ctx.bc, &alu); 1734 if (r) 1735 return r; 1736 } 1737 so.output[i].start_component = 0; 1738 so_gpr[i] = tmp; 1739 } 1740 } 1741 1742 /* Write outputs to buffers. 
*/ 1743 for (i = 0; i < so.num_outputs; i++) { 1744 struct r600_bytecode_output output; 1745 1746 memset(&output, 0, sizeof(struct r600_bytecode_output)); 1747 output.gpr = so_gpr[i]; 1748 output.elem_size = so.output[i].num_components; 1749 output.array_base = so.output[i].dst_offset - so.output[i].start_component; 1750 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; 1751 output.burst_count = 1; 1752 output.barrier = 1; 1753 /* array_size is an upper limit for the burst_count 1754 * with MEM_STREAM instructions */ 1755 output.array_size = 0xFFF; 1756 output.comp_mask = ((1 << so.output[i].num_components) - 1) << so.output[i].start_component; 1757 if (ctx.bc->chip_class >= EVERGREEN) { 1758 switch (so.output[i].output_buffer) { 1759 case 0: 1760 output.op = CF_OP_MEM_STREAM0_BUF0; 1761 break; 1762 case 1: 1763 output.op = CF_OP_MEM_STREAM0_BUF1; 1764 break; 1765 case 2: 1766 output.op = CF_OP_MEM_STREAM0_BUF2; 1767 break; 1768 case 3: 1769 output.op = CF_OP_MEM_STREAM0_BUF3; 1770 break; 1771 } 1772 } else { 1773 switch (so.output[i].output_buffer) { 1774 case 0: 1775 output.op = CF_OP_MEM_STREAM0; 1776 break; 1777 case 1: 1778 output.op = CF_OP_MEM_STREAM1; 1779 break; 1780 case 2: 1781 output.op = CF_OP_MEM_STREAM2; 1782 break; 1783 case 3: 1784 output.op = CF_OP_MEM_STREAM3; 1785 break; 1786 } 1787 } 1788 r = r600_bytecode_add_output(ctx.bc, &output); 1789 if (r) 1790 goto out_err; 1791 } 1792 } 1793 1794 /* export output */ 1795 for (i = 0, j = 0; i < noutput; i++, j++) { 1796 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 1797 output[j].gpr = shader->output[i].gpr; 1798 output[j].elem_size = 3; 1799 output[j].swizzle_x = 0; 1800 output[j].swizzle_y = 1; 1801 output[j].swizzle_z = 2; 1802 output[j].swizzle_w = 3; 1803 output[j].burst_count = 1; 1804 output[j].barrier = 1; 1805 output[j].type = -1; 1806 output[j].op = CF_OP_EXPORT; 1807 switch (ctx.type) { 1808 case TGSI_PROCESSOR_VERTEX: 1809 switch (shader->output[i].name) { 1810 case 
TGSI_SEMANTIC_POSITION: 1811 output[j].array_base = next_pos_base++; 1812 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 1813 break; 1814 1815 case TGSI_SEMANTIC_PSIZE: 1816 output[j].array_base = next_pos_base++; 1817 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 1818 break; 1819 case TGSI_SEMANTIC_CLIPVERTEX: 1820 j--; 1821 break; 1822 case TGSI_SEMANTIC_CLIPDIST: 1823 output[j].array_base = next_pos_base++; 1824 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 1825 /* spi_sid is 0 for clipdistance outputs that were generated 1826 * for clipvertex - we don't need to pass them to PS */ 1827 if (shader->output[i].spi_sid) { 1828 j++; 1829 /* duplicate it as PARAM to pass to the pixel shader */ 1830 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 1831 output[j].array_base = next_param_base++; 1832 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 1833 } 1834 break; 1835 case TGSI_SEMANTIC_FOG: 1836 output[j].swizzle_y = 4; /* 0 */ 1837 output[j].swizzle_z = 4; /* 0 */ 1838 output[j].swizzle_w = 5; /* 1 */ 1839 break; 1840 } 1841 break; 1842 case TGSI_PROCESSOR_FRAGMENT: 1843 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) { 1844 /* never export more colors than the number of CBs */ 1845 if (next_pixel_base && next_pixel_base >= key.nr_cbufs) { 1846 /* skip export */ 1847 j--; 1848 continue; 1849 } 1850 output[j].swizzle_w = key.alpha_to_one ? 5 : 3; 1851 output[j].array_base = next_pixel_base++; 1852 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 1853 shader->nr_ps_color_exports++; 1854 if (shader->fs_write_all && (rscreen->chip_class >= EVERGREEN)) { 1855 for (k = 1; k < key.nr_cbufs; k++) { 1856 j++; 1857 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 1858 output[j].gpr = shader->output[i].gpr; 1859 output[j].elem_size = 3; 1860 output[j].swizzle_x = 0; 1861 output[j].swizzle_y = 1; 1862 output[j].swizzle_z = 2; 1863 output[j].swizzle_w = key.alpha_to_one ? 
5 : 3; 1864 output[j].burst_count = 1; 1865 output[j].barrier = 1; 1866 output[j].array_base = next_pixel_base++; 1867 output[j].op = CF_OP_EXPORT; 1868 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 1869 shader->nr_ps_color_exports++; 1870 } 1871 } 1872 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) { 1873 output[j].array_base = 61; 1874 output[j].swizzle_x = 2; 1875 output[j].swizzle_y = 7; 1876 output[j].swizzle_z = output[j].swizzle_w = 7; 1877 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 1878 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) { 1879 output[j].array_base = 61; 1880 output[j].swizzle_x = 7; 1881 output[j].swizzle_y = 1; 1882 output[j].swizzle_z = output[j].swizzle_w = 7; 1883 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 1884 } else { 1885 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name); 1886 r = -EINVAL; 1887 goto out_err; 1888 } 1889 break; 1890 default: 1891 R600_ERR("unsupported processor type %d\n", ctx.type); 1892 r = -EINVAL; 1893 goto out_err; 1894 } 1895 1896 if (output[j].type==-1) { 1897 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 1898 output[j].array_base = next_param_base++; 1899 } 1900 } 1901 1902 /* add fake position export */ 1903 if (ctx.type == TGSI_PROCESSOR_VERTEX && next_pos_base == 60) { 1904 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 1905 output[j].gpr = 0; 1906 output[j].elem_size = 3; 1907 output[j].swizzle_x = 7; 1908 output[j].swizzle_y = 7; 1909 output[j].swizzle_z = 7; 1910 output[j].swizzle_w = 7; 1911 output[j].burst_count = 1; 1912 output[j].barrier = 1; 1913 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 1914 output[j].array_base = next_pos_base; 1915 output[j].op = CF_OP_EXPORT; 1916 j++; 1917 } 1918 1919 /* add fake param output for vertex shader if no param is exported */ 1920 if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) { 1921 memset(&output[j], 0, 
sizeof(struct r600_bytecode_output)); 1922 output[j].gpr = 0; 1923 output[j].elem_size = 3; 1924 output[j].swizzle_x = 7; 1925 output[j].swizzle_y = 7; 1926 output[j].swizzle_z = 7; 1927 output[j].swizzle_w = 7; 1928 output[j].burst_count = 1; 1929 output[j].barrier = 1; 1930 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 1931 output[j].array_base = 0; 1932 output[j].op = CF_OP_EXPORT; 1933 j++; 1934 } 1935 1936 /* add fake pixel export */ 1937 if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) { 1938 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 1939 output[j].gpr = 0; 1940 output[j].elem_size = 3; 1941 output[j].swizzle_x = 7; 1942 output[j].swizzle_y = 7; 1943 output[j].swizzle_z = 7; 1944 output[j].swizzle_w = 7; 1945 output[j].burst_count = 1; 1946 output[j].barrier = 1; 1947 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 1948 output[j].array_base = 0; 1949 output[j].op = CF_OP_EXPORT; 1950 j++; 1951 } 1952 1953 noutput = j; 1954 1955 /* set export done on last export of each type */ 1956 for (i = noutput - 1, output_done = 0; i >= 0; i--) { 1957 if (ctx.bc->chip_class < CAYMAN) { 1958 if (i == (noutput - 1)) { 1959 output[i].end_of_program = 1; 1960 } 1961 } 1962 if (!(output_done & (1 << output[i].type))) { 1963 output_done |= (1 << output[i].type); 1964 output[i].op = CF_OP_EXPORT_DONE; 1965 } 1966 } 1967 /* add output to bytecode */ 1968 if (!use_llvm) { 1969 for (i = 0; i < noutput; i++) { 1970 r = r600_bytecode_add_output(ctx.bc, &output[i]); 1971 if (r) 1972 goto out_err; 1973 } 1974 } 1975 /* add program end */ 1976 if (!use_llvm && ctx.bc->chip_class == CAYMAN) 1977 cm_bytecode_add_cf_end(ctx.bc); 1978 1979 /* check GPR limit - we have 124 = 128 - 4 1980 * (4 are reserved as alu clause temporary registers) */ 1981 if (ctx.bc->ngpr > 124) { 1982 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr); 1983 r = -ENOMEM; 1984 goto out_err; 1985 } 1986 1987 free(ctx.literals); 
	tgsi_parse_free(&ctx.parse);
	return 0;
out_err:
	free(ctx.literals);
	tgsi_parse_free(&ctx.parse);
	return r;
}

/* Fallback handler: report a TGSI opcode this backend cannot translate. */
static int tgsi_unsupported(struct r600_shader_ctx *ctx)
{
	R600_ERR("%s tgsi opcode unsupported\n",
		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
	return -EINVAL;
}

/* TGSI END emits no bytecode of its own. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	return 0;
}

/* Fill an ALU source operand from a parsed shader source for the given
 * destination channel: the swizzle selects the actual hw channel, and the
 * literal value (if any) is the one for that swizzled channel. */
static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			const struct r600_shader_src *shader_src,
			unsigned chan)
{
	bc_src->sel = shader_src->sel;
	bc_src->chan = shader_src->swizzle[chan];
	bc_src->neg = shader_src->neg;
	bc_src->abs = shader_src->abs;
	bc_src->rel = shader_src->rel;
	bc_src->value = shader_src->value[bc_src->chan];
	bc_src->kc_bank = shader_src->kc_bank;
}

/* Force |src|; clearing neg here means any pending negate is dropped,
 * not applied on top of the absolute value. */
static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}

/* Flip the negate modifier on a source operand. */
static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->neg = !bc_src->neg;
}

/* Fill an ALU destination from a TGSI dst register for one channel.
 * The register index is biased by the per-file GPR offset; saturate on the
 * current instruction maps to the hw clamp bit. */
static void tgsi_dst(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_dst_register *tgsi_dst,
		     unsigned swizzle,
		     struct r600_bytecode_alu_dst *r600_dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	r600_dst->sel = tgsi_dst->Register.Index;
	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
	r600_dst->chan = swizzle;
	r600_dst->write = 1;
	if (tgsi_dst->Register.Indirect)
		r600_dst->rel = V_SQ_REL_RELATIVE;
	if (inst->Instruction.Saturate) {
		r600_dst->clamp = 1;
	}
}

/* Return the index of the highest channel set in a 4-bit writemask
 * (0 if the mask is empty). */
static int tgsi_last_instruction(unsigned writemask)
{
	int i, lasti = 0;

	for (i = 0; i < 4; i++) {
		if (writemask & (1 << i)) {
			lasti = i;
		}
	}
	return lasti;
}

/* Common emitter for simple per-channel ALU ops.
 * swap: exchange src0/src1 (used by the *_swap variants);
 * trans_only: mark every emitted instruction as group-last —
 * presumably because the op is restricted to the trans slot (TODO confirm).
 * SUB and ABS are lowered here via source modifiers. */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* handle some special cases */
		switch (ctx->inst_info->tgsi_opcode) {
		case TGSI_OPCODE_SUB:
			/* a - b == a + (-b) */
			r600_bytecode_src_toggle_neg(&alu.src[1]);
			break;
		case TGSI_OPCODE_ABS:
			r600_bytecode_src_set_abs(&alu.src[0]);
			break;
		default:
			break;
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}

static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}

static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}

/* Integer negate, emitted per channel as (0 - src). */
static int tgsi_ineg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;

}

/* Cayman: emit a scalar (src channel 0) op in each needed vector slot;
 * only slots selected by the writemask actually write their result.
 * See the CAYMAN notes at the top of the file. */
static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, r;
	struct r600_bytecode_alu alu;
	/* need slot w (index 3) only if the W channel is written */
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);

			/* RSQ should take the absolute value of src */
			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
				r600_bytecode_src_set_abs(&alu.src[j]);
			}
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Cayman: for each written channel k, emit the op in all four slots with
 * channel-k sources, letting only slot k write its result. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	for (k = 0; k < last_slot; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

/*
 * r600 - trunc to -PI..PI range
 * r700 - normalize by dividing by 2PI
 * see fdo bug 27901
 */
/* Reduce the trig argument into temp_reg.x before SIN/COS:
 *   tmp = fract(src * 1/(2*PI) + 0.5)
 * then rescale: on R600 tmp*2*PI - PI; otherwise tmp*1.0 - 0.5. */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
	static float double_pi = 3.1415926535 * 2;
	static float neg_pi = -3.1415926535;

	int r;
	struct r600_bytecode_alu alu;

	/* tmp.x = src * (1/2PI) + 0.5 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = *(uint32_t *)&half_inv_pi;
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* tmp.x = fract(tmp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* tmp.x = tmp.x * C1 + C2, chip-dependent rescale (see above) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		/* back into -PI..PI */
		alu.src[1].value = *(uint32_t *)&double_pi;
		alu.src[2].value = *(uint32_t *)&neg_pi;
	} else {
		/* r700+: keep normalized, shift to -0.5..0.5 */
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* Cayman SIN/COS: argument reduction, then the op replicated across slots. */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* R600/R700 SIN/COS: argument reduction, scalar trig op into temp_reg.x,
 * then MOV-replicate the result to all written channels. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* TGSI SCS: dst.x = cos(src.x), dst.y = sin(src.x), dst.z = 0, dst.w = 1,
 * each channel gated by the writemask. */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* TGSI KIL/KILP: emit the kill op on all four channels.
 * For the unconditional KILP the compare source is the literal -1.0. */
static int tgsi_kill(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
			alu.src[1].sel = V_SQ_ALU_SRC_1;
			alu.src[1].neg = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* kill must be last in ALU */
	ctx->bc->force_add_cf = 1;
	ctx->shader->uses_kill = TRUE;
	return 0;
}

/* TGSI LIT (classic OpenGL lighting coefficients):
 * dst.x = 1, dst.y = max(src.x, 0), dst.w = 1,
 * dst.z = pow(max(src.y, 0), src.w) via LOG_CLAMPED / MUL_LIT / EXP_IEEE,
 * with Cayman slot-replication where the scalar units differ. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where LOG landed (chan differs per chip path) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}

/* TGSI RSQ: clamped reciprocal square root of |src.x| into temp_reg.x,
 * then replicated to the written destination channels. */
static int tgsi_rsq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIPSQRT_IEEE instead.
	 */
	alu.op = ALU_OP1_RECIPSQRT_CLAMPED;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
		r600_bytecode_src_set_abs(&alu.src[i]);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}

/* MOV temp_reg.x into every destination channel selected by the writemask. */
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.src[0].sel = ctx->temp_reg;
		alu.op = ALU_OP1_MOV;
		alu.dst.chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Scalar op on the .x channels of the sources, result replicated to all
 * written destination channels. */
static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}

/* Cayman POW: same exp2(b * log2(a)) lowering as tgsi_pow below, with the
 * scalar LOG/EXP stages replicated across the vector slots. */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* LOG2(a) */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* TGSI POW lowered as exp2(b * log2(a)), result replicated to the
 * written destination channels. */
static int tgsi_pow(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_LOG_IEEE;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* POW(a,b) = EXP2(b * LOG2(a))*/
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_EXP_IEEE;
	alu.src[0].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return tgsi_helper_tempx_replicate(ctx);
}

/* Integer divide/modulo via Newton-Raphson style reciprocal refinement;
 * mod selects UMOD/IMOD, signed_op selects the signed variants.
 * The step numbers in the comment below are referenced throughout the body. */
static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int tmp0 = ctx->temp_reg;
	int tmp1 = r600_get_temp(ctx);
	int tmp2 = r600_get_temp(ctx);
	int tmp3 = r600_get_temp(ctx);
	/* Unsigned path:
	 *
	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
	 *
	 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
	 * 2. tmp0.z = lo (tmp0.x * src2)
	 * 3. tmp0.w = -tmp0.z
	 * 4. tmp0.y = hi (tmp0.x * src2)
	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
	 * 7. tmp1.x = tmp0.x - tmp0.w
	 * 8. tmp1.y = tmp0.x + tmp0.w
	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
	 *
	 * 12. tmp0.w = src1 - tmp0.y       = r
	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
	 *
	 * if DIV
	 *
	 * 15. tmp1.z = tmp0.z + 1			= q + 1
	 * 16. tmp1.w = tmp0.z - 1			= q - 1
	 *
	 * else MOD
	 *
	 * 15. tmp1.z = tmp0.w - src2			= r - src2
	 * 16. tmp1.w = tmp0.w + src2			= r + src2
	 *
	 * endif
	 *
	 * 17. tmp1.x = tmp1.x & tmp1.y
	 *
	 * DIV: 18. tmp0.z = tmp1.x==0 ?
tmp0.z : tmp1.z 2873 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z 2874 * 2875 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z 2876 * 20. dst = src2==0 ? MAX_UINT : tmp0.z 2877 * 2878 * Signed path: 2879 * 2880 * Same as unsigned, using abs values of the operands, 2881 * and fixing the sign of the result in the end. 2882 */ 2883 2884 for (i = 0; i < 4; i++) { 2885 if (!(write_mask & (1<<i))) 2886 continue; 2887 2888 if (signed_op) { 2889 2890 /* tmp2.x = -src0 */ 2891 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2892 alu.op = ALU_OP2_SUB_INT; 2893 2894 alu.dst.sel = tmp2; 2895 alu.dst.chan = 0; 2896 alu.dst.write = 1; 2897 2898 alu.src[0].sel = V_SQ_ALU_SRC_0; 2899 2900 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 2901 2902 alu.last = 1; 2903 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 2904 return r; 2905 2906 /* tmp2.y = -src1 */ 2907 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2908 alu.op = ALU_OP2_SUB_INT; 2909 2910 alu.dst.sel = tmp2; 2911 alu.dst.chan = 1; 2912 alu.dst.write = 1; 2913 2914 alu.src[0].sel = V_SQ_ALU_SRC_0; 2915 2916 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 2917 2918 alu.last = 1; 2919 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 2920 return r; 2921 2922 /* tmp2.z sign bit is set if src0 and src2 signs are different */ 2923 /* it will be a sign of the quotient */ 2924 if (!mod) { 2925 2926 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2927 alu.op = ALU_OP2_XOR_INT; 2928 2929 alu.dst.sel = tmp2; 2930 alu.dst.chan = 2; 2931 alu.dst.write = 1; 2932 2933 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 2934 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 2935 2936 alu.last = 1; 2937 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 2938 return r; 2939 } 2940 2941 /* tmp2.x = |src0| */ 2942 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2943 alu.op = ALU_OP3_CNDGE_INT; 2944 alu.is_op3 = 1; 2945 2946 alu.dst.sel = tmp2; 2947 alu.dst.chan = 0; 2948 alu.dst.write = 1; 2949 2950 r600_bytecode_src(&alu.src[0], 
&ctx->src[0], i); 2951 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 2952 alu.src[2].sel = tmp2; 2953 alu.src[2].chan = 0; 2954 2955 alu.last = 1; 2956 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 2957 return r; 2958 2959 /* tmp2.y = |src1| */ 2960 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2961 alu.op = ALU_OP3_CNDGE_INT; 2962 alu.is_op3 = 1; 2963 2964 alu.dst.sel = tmp2; 2965 alu.dst.chan = 1; 2966 alu.dst.write = 1; 2967 2968 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 2969 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 2970 alu.src[2].sel = tmp2; 2971 alu.src[2].chan = 1; 2972 2973 alu.last = 1; 2974 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 2975 return r; 2976 2977 } 2978 2979 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */ 2980 if (ctx->bc->chip_class == CAYMAN) { 2981 /* tmp3.x = u2f(src2) */ 2982 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 2983 alu.op = ALU_OP1_UINT_TO_FLT; 2984 2985 alu.dst.sel = tmp3; 2986 alu.dst.chan = 0; 2987 alu.dst.write = 1; 2988 2989 if (signed_op) { 2990 alu.src[0].sel = tmp2; 2991 alu.src[0].chan = 1; 2992 } else { 2993 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 2994 } 2995 2996 alu.last = 1; 2997 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 2998 return r; 2999 3000 /* tmp0.x = recip(tmp3.x) */ 3001 for (j = 0 ; j < 3; j++) { 3002 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3003 alu.op = ALU_OP1_RECIP_IEEE; 3004 3005 alu.dst.sel = tmp0; 3006 alu.dst.chan = j; 3007 alu.dst.write = (j == 0); 3008 3009 alu.src[0].sel = tmp3; 3010 alu.src[0].chan = 0; 3011 3012 if (j == 2) 3013 alu.last = 1; 3014 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3015 return r; 3016 } 3017 3018 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3019 alu.op = ALU_OP2_MUL; 3020 3021 alu.src[0].sel = tmp0; 3022 alu.src[0].chan = 0; 3023 3024 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 3025 alu.src[1].value = 0x4f800000; 3026 3027 alu.dst.sel = tmp3; 3028 alu.dst.write = 1; 3029 alu.last 
= 1; 3030 r = r600_bytecode_add_alu(ctx->bc, &alu); 3031 if (r) 3032 return r; 3033 3034 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3035 alu.op = ALU_OP1_FLT_TO_UINT; 3036 3037 alu.dst.sel = tmp0; 3038 alu.dst.chan = 0; 3039 alu.dst.write = 1; 3040 3041 alu.src[0].sel = tmp3; 3042 alu.src[0].chan = 0; 3043 3044 alu.last = 1; 3045 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3046 return r; 3047 3048 } else { 3049 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3050 alu.op = ALU_OP1_RECIP_UINT; 3051 3052 alu.dst.sel = tmp0; 3053 alu.dst.chan = 0; 3054 alu.dst.write = 1; 3055 3056 if (signed_op) { 3057 alu.src[0].sel = tmp2; 3058 alu.src[0].chan = 1; 3059 } else { 3060 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 3061 } 3062 3063 alu.last = 1; 3064 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3065 return r; 3066 } 3067 3068 /* 2. tmp0.z = lo (tmp0.x * src2) */ 3069 if (ctx->bc->chip_class == CAYMAN) { 3070 for (j = 0 ; j < 4; j++) { 3071 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3072 alu.op = ALU_OP2_MULLO_UINT; 3073 3074 alu.dst.sel = tmp0; 3075 alu.dst.chan = j; 3076 alu.dst.write = (j == 2); 3077 3078 alu.src[0].sel = tmp0; 3079 alu.src[0].chan = 0; 3080 if (signed_op) { 3081 alu.src[1].sel = tmp2; 3082 alu.src[1].chan = 1; 3083 } else { 3084 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 3085 } 3086 3087 alu.last = (j == 3); 3088 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3089 return r; 3090 } 3091 } else { 3092 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3093 alu.op = ALU_OP2_MULLO_UINT; 3094 3095 alu.dst.sel = tmp0; 3096 alu.dst.chan = 2; 3097 alu.dst.write = 1; 3098 3099 alu.src[0].sel = tmp0; 3100 alu.src[0].chan = 0; 3101 if (signed_op) { 3102 alu.src[1].sel = tmp2; 3103 alu.src[1].chan = 1; 3104 } else { 3105 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 3106 } 3107 3108 alu.last = 1; 3109 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3110 return r; 3111 } 3112 3113 /* 3. 
tmp0.w = -tmp0.z */ 3114 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3115 alu.op = ALU_OP2_SUB_INT; 3116 3117 alu.dst.sel = tmp0; 3118 alu.dst.chan = 3; 3119 alu.dst.write = 1; 3120 3121 alu.src[0].sel = V_SQ_ALU_SRC_0; 3122 alu.src[1].sel = tmp0; 3123 alu.src[1].chan = 2; 3124 3125 alu.last = 1; 3126 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3127 return r; 3128 3129 /* 4. tmp0.y = hi (tmp0.x * src2) */ 3130 if (ctx->bc->chip_class == CAYMAN) { 3131 for (j = 0 ; j < 4; j++) { 3132 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3133 alu.op = ALU_OP2_MULHI_UINT; 3134 3135 alu.dst.sel = tmp0; 3136 alu.dst.chan = j; 3137 alu.dst.write = (j == 1); 3138 3139 alu.src[0].sel = tmp0; 3140 alu.src[0].chan = 0; 3141 3142 if (signed_op) { 3143 alu.src[1].sel = tmp2; 3144 alu.src[1].chan = 1; 3145 } else { 3146 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 3147 } 3148 alu.last = (j == 3); 3149 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3150 return r; 3151 } 3152 } else { 3153 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3154 alu.op = ALU_OP2_MULHI_UINT; 3155 3156 alu.dst.sel = tmp0; 3157 alu.dst.chan = 1; 3158 alu.dst.write = 1; 3159 3160 alu.src[0].sel = tmp0; 3161 alu.src[0].chan = 0; 3162 3163 if (signed_op) { 3164 alu.src[1].sel = tmp2; 3165 alu.src[1].chan = 1; 3166 } else { 3167 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 3168 } 3169 3170 alu.last = 1; 3171 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3172 return r; 3173 } 3174 3175 /* 5. tmp0.z = (tmp0.y == 0 ? 
tmp0.w : tmp0.z) = abs(lo(rcp*src)) */ 3176 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3177 alu.op = ALU_OP3_CNDE_INT; 3178 alu.is_op3 = 1; 3179 3180 alu.dst.sel = tmp0; 3181 alu.dst.chan = 2; 3182 alu.dst.write = 1; 3183 3184 alu.src[0].sel = tmp0; 3185 alu.src[0].chan = 1; 3186 alu.src[1].sel = tmp0; 3187 alu.src[1].chan = 3; 3188 alu.src[2].sel = tmp0; 3189 alu.src[2].chan = 2; 3190 3191 alu.last = 1; 3192 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3193 return r; 3194 3195 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */ 3196 if (ctx->bc->chip_class == CAYMAN) { 3197 for (j = 0 ; j < 4; j++) { 3198 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3199 alu.op = ALU_OP2_MULHI_UINT; 3200 3201 alu.dst.sel = tmp0; 3202 alu.dst.chan = j; 3203 alu.dst.write = (j == 3); 3204 3205 alu.src[0].sel = tmp0; 3206 alu.src[0].chan = 2; 3207 3208 alu.src[1].sel = tmp0; 3209 alu.src[1].chan = 0; 3210 3211 alu.last = (j == 3); 3212 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3213 return r; 3214 } 3215 } else { 3216 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3217 alu.op = ALU_OP2_MULHI_UINT; 3218 3219 alu.dst.sel = tmp0; 3220 alu.dst.chan = 3; 3221 alu.dst.write = 1; 3222 3223 alu.src[0].sel = tmp0; 3224 alu.src[0].chan = 2; 3225 3226 alu.src[1].sel = tmp0; 3227 alu.src[1].chan = 0; 3228 3229 alu.last = 1; 3230 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3231 return r; 3232 } 3233 3234 /* 7. tmp1.x = tmp0.x - tmp0.w */ 3235 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3236 alu.op = ALU_OP2_SUB_INT; 3237 3238 alu.dst.sel = tmp1; 3239 alu.dst.chan = 0; 3240 alu.dst.write = 1; 3241 3242 alu.src[0].sel = tmp0; 3243 alu.src[0].chan = 0; 3244 alu.src[1].sel = tmp0; 3245 alu.src[1].chan = 3; 3246 3247 alu.last = 1; 3248 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3249 return r; 3250 3251 /* 8. 
tmp1.y = tmp0.x + tmp0.w */ 3252 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3253 alu.op = ALU_OP2_ADD_INT; 3254 3255 alu.dst.sel = tmp1; 3256 alu.dst.chan = 1; 3257 alu.dst.write = 1; 3258 3259 alu.src[0].sel = tmp0; 3260 alu.src[0].chan = 0; 3261 alu.src[1].sel = tmp0; 3262 alu.src[1].chan = 3; 3263 3264 alu.last = 1; 3265 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3266 return r; 3267 3268 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */ 3269 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3270 alu.op = ALU_OP3_CNDE_INT; 3271 alu.is_op3 = 1; 3272 3273 alu.dst.sel = tmp0; 3274 alu.dst.chan = 0; 3275 alu.dst.write = 1; 3276 3277 alu.src[0].sel = tmp0; 3278 alu.src[0].chan = 1; 3279 alu.src[1].sel = tmp1; 3280 alu.src[1].chan = 1; 3281 alu.src[2].sel = tmp1; 3282 alu.src[2].chan = 0; 3283 3284 alu.last = 1; 3285 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3286 return r; 3287 3288 /* 10. tmp0.z = hi(tmp0.x * src1) = q */ 3289 if (ctx->bc->chip_class == CAYMAN) { 3290 for (j = 0 ; j < 4; j++) { 3291 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3292 alu.op = ALU_OP2_MULHI_UINT; 3293 3294 alu.dst.sel = tmp0; 3295 alu.dst.chan = j; 3296 alu.dst.write = (j == 2); 3297 3298 alu.src[0].sel = tmp0; 3299 alu.src[0].chan = 0; 3300 3301 if (signed_op) { 3302 alu.src[1].sel = tmp2; 3303 alu.src[1].chan = 0; 3304 } else { 3305 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 3306 } 3307 3308 alu.last = (j == 3); 3309 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3310 return r; 3311 } 3312 } else { 3313 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3314 alu.op = ALU_OP2_MULHI_UINT; 3315 3316 alu.dst.sel = tmp0; 3317 alu.dst.chan = 2; 3318 alu.dst.write = 1; 3319 3320 alu.src[0].sel = tmp0; 3321 alu.src[0].chan = 0; 3322 3323 if (signed_op) { 3324 alu.src[1].sel = tmp2; 3325 alu.src[1].chan = 0; 3326 } else { 3327 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 3328 } 3329 3330 alu.last = 1; 3331 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3332 
return r; 3333 } 3334 3335 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */ 3336 if (ctx->bc->chip_class == CAYMAN) { 3337 for (j = 0 ; j < 4; j++) { 3338 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3339 alu.op = ALU_OP2_MULLO_UINT; 3340 3341 alu.dst.sel = tmp0; 3342 alu.dst.chan = j; 3343 alu.dst.write = (j == 1); 3344 3345 if (signed_op) { 3346 alu.src[0].sel = tmp2; 3347 alu.src[0].chan = 1; 3348 } else { 3349 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 3350 } 3351 3352 alu.src[1].sel = tmp0; 3353 alu.src[1].chan = 2; 3354 3355 alu.last = (j == 3); 3356 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3357 return r; 3358 } 3359 } else { 3360 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3361 alu.op = ALU_OP2_MULLO_UINT; 3362 3363 alu.dst.sel = tmp0; 3364 alu.dst.chan = 1; 3365 alu.dst.write = 1; 3366 3367 if (signed_op) { 3368 alu.src[0].sel = tmp2; 3369 alu.src[0].chan = 1; 3370 } else { 3371 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 3372 } 3373 3374 alu.src[1].sel = tmp0; 3375 alu.src[1].chan = 2; 3376 3377 alu.last = 1; 3378 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3379 return r; 3380 } 3381 3382 /* 12. tmp0.w = src1 - tmp0.y = r */ 3383 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3384 alu.op = ALU_OP2_SUB_INT; 3385 3386 alu.dst.sel = tmp0; 3387 alu.dst.chan = 3; 3388 alu.dst.write = 1; 3389 3390 if (signed_op) { 3391 alu.src[0].sel = tmp2; 3392 alu.src[0].chan = 0; 3393 } else { 3394 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3395 } 3396 3397 alu.src[1].sel = tmp0; 3398 alu.src[1].chan = 1; 3399 3400 alu.last = 1; 3401 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3402 return r; 3403 3404 /* 13. 
tmp1.x = tmp0.w >= src2 = r >= src2 */ 3405 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3406 alu.op = ALU_OP2_SETGE_UINT; 3407 3408 alu.dst.sel = tmp1; 3409 alu.dst.chan = 0; 3410 alu.dst.write = 1; 3411 3412 alu.src[0].sel = tmp0; 3413 alu.src[0].chan = 3; 3414 if (signed_op) { 3415 alu.src[1].sel = tmp2; 3416 alu.src[1].chan = 1; 3417 } else { 3418 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 3419 } 3420 3421 alu.last = 1; 3422 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3423 return r; 3424 3425 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */ 3426 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3427 alu.op = ALU_OP2_SETGE_UINT; 3428 3429 alu.dst.sel = tmp1; 3430 alu.dst.chan = 1; 3431 alu.dst.write = 1; 3432 3433 if (signed_op) { 3434 alu.src[0].sel = tmp2; 3435 alu.src[0].chan = 0; 3436 } else { 3437 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3438 } 3439 3440 alu.src[1].sel = tmp0; 3441 alu.src[1].chan = 1; 3442 3443 alu.last = 1; 3444 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3445 return r; 3446 3447 if (mod) { /* UMOD */ 3448 3449 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */ 3450 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3451 alu.op = ALU_OP2_SUB_INT; 3452 3453 alu.dst.sel = tmp1; 3454 alu.dst.chan = 2; 3455 alu.dst.write = 1; 3456 3457 alu.src[0].sel = tmp0; 3458 alu.src[0].chan = 3; 3459 3460 if (signed_op) { 3461 alu.src[1].sel = tmp2; 3462 alu.src[1].chan = 1; 3463 } else { 3464 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 3465 } 3466 3467 alu.last = 1; 3468 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3469 return r; 3470 3471 /* 16. 
tmp1.w = tmp0.w + src2 = r + src2 */ 3472 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3473 alu.op = ALU_OP2_ADD_INT; 3474 3475 alu.dst.sel = tmp1; 3476 alu.dst.chan = 3; 3477 alu.dst.write = 1; 3478 3479 alu.src[0].sel = tmp0; 3480 alu.src[0].chan = 3; 3481 if (signed_op) { 3482 alu.src[1].sel = tmp2; 3483 alu.src[1].chan = 1; 3484 } else { 3485 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 3486 } 3487 3488 alu.last = 1; 3489 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3490 return r; 3491 3492 } else { /* UDIV */ 3493 3494 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */ 3495 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3496 alu.op = ALU_OP2_ADD_INT; 3497 3498 alu.dst.sel = tmp1; 3499 alu.dst.chan = 2; 3500 alu.dst.write = 1; 3501 3502 alu.src[0].sel = tmp0; 3503 alu.src[0].chan = 2; 3504 alu.src[1].sel = V_SQ_ALU_SRC_1_INT; 3505 3506 alu.last = 1; 3507 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3508 return r; 3509 3510 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */ 3511 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3512 alu.op = ALU_OP2_ADD_INT; 3513 3514 alu.dst.sel = tmp1; 3515 alu.dst.chan = 3; 3516 alu.dst.write = 1; 3517 3518 alu.src[0].sel = tmp0; 3519 alu.src[0].chan = 2; 3520 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT; 3521 3522 alu.last = 1; 3523 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3524 return r; 3525 3526 } 3527 3528 /* 17. tmp1.x = tmp1.x & tmp1.y */ 3529 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3530 alu.op = ALU_OP2_AND_INT; 3531 3532 alu.dst.sel = tmp1; 3533 alu.dst.chan = 0; 3534 alu.dst.write = 1; 3535 3536 alu.src[0].sel = tmp1; 3537 alu.src[0].chan = 0; 3538 alu.src[1].sel = tmp1; 3539 alu.src[1].chan = 1; 3540 3541 alu.last = 1; 3542 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3543 return r; 3544 3545 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */ 3546 /* 18. tmp0.z = tmp1.x==0 ? 
tmp0.w : tmp1.z MOD */ 3547 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3548 alu.op = ALU_OP3_CNDE_INT; 3549 alu.is_op3 = 1; 3550 3551 alu.dst.sel = tmp0; 3552 alu.dst.chan = 2; 3553 alu.dst.write = 1; 3554 3555 alu.src[0].sel = tmp1; 3556 alu.src[0].chan = 0; 3557 alu.src[1].sel = tmp0; 3558 alu.src[1].chan = mod ? 3 : 2; 3559 alu.src[2].sel = tmp1; 3560 alu.src[2].chan = 2; 3561 3562 alu.last = 1; 3563 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3564 return r; 3565 3566 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */ 3567 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3568 alu.op = ALU_OP3_CNDE_INT; 3569 alu.is_op3 = 1; 3570 3571 if (signed_op) { 3572 alu.dst.sel = tmp0; 3573 alu.dst.chan = 2; 3574 alu.dst.write = 1; 3575 } else { 3576 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3577 } 3578 3579 alu.src[0].sel = tmp1; 3580 alu.src[0].chan = 1; 3581 alu.src[1].sel = tmp1; 3582 alu.src[1].chan = 3; 3583 alu.src[2].sel = tmp0; 3584 alu.src[2].chan = 2; 3585 3586 alu.last = 1; 3587 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3588 return r; 3589 3590 if (signed_op) { 3591 3592 /* fix the sign of the result */ 3593 3594 if (mod) { 3595 3596 /* tmp0.x = -tmp0.z */ 3597 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3598 alu.op = ALU_OP2_SUB_INT; 3599 3600 alu.dst.sel = tmp0; 3601 alu.dst.chan = 0; 3602 alu.dst.write = 1; 3603 3604 alu.src[0].sel = V_SQ_ALU_SRC_0; 3605 alu.src[1].sel = tmp0; 3606 alu.src[1].chan = 2; 3607 3608 alu.last = 1; 3609 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3610 return r; 3611 3612 /* sign of the remainder is the same as the sign of src0 */ 3613 /* tmp0.x = src0>=0 ? 
tmp0.z : tmp0.x */ 3614 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3615 alu.op = ALU_OP3_CNDGE_INT; 3616 alu.is_op3 = 1; 3617 3618 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3619 3620 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3621 alu.src[1].sel = tmp0; 3622 alu.src[1].chan = 2; 3623 alu.src[2].sel = tmp0; 3624 alu.src[2].chan = 0; 3625 3626 alu.last = 1; 3627 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3628 return r; 3629 3630 } else { 3631 3632 /* tmp0.x = -tmp0.z */ 3633 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3634 alu.op = ALU_OP2_SUB_INT; 3635 3636 alu.dst.sel = tmp0; 3637 alu.dst.chan = 0; 3638 alu.dst.write = 1; 3639 3640 alu.src[0].sel = V_SQ_ALU_SRC_0; 3641 alu.src[1].sel = tmp0; 3642 alu.src[1].chan = 2; 3643 3644 alu.last = 1; 3645 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3646 return r; 3647 3648 /* fix the quotient sign (same as the sign of src0*src1) */ 3649 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */ 3650 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3651 alu.op = ALU_OP3_CNDGE_INT; 3652 alu.is_op3 = 1; 3653 3654 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3655 3656 alu.src[0].sel = tmp2; 3657 alu.src[0].chan = 2; 3658 alu.src[1].sel = tmp0; 3659 alu.src[1].chan = 2; 3660 alu.src[2].sel = tmp0; 3661 alu.src[2].chan = 0; 3662 3663 alu.last = 1; 3664 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3665 return r; 3666 } 3667 } 3668 } 3669 return 0; 3670} 3671 3672static int tgsi_udiv(struct r600_shader_ctx *ctx) 3673{ 3674 return tgsi_divmod(ctx, 0, 0); 3675} 3676 3677static int tgsi_umod(struct r600_shader_ctx *ctx) 3678{ 3679 return tgsi_divmod(ctx, 1, 0); 3680} 3681 3682static int tgsi_idiv(struct r600_shader_ctx *ctx) 3683{ 3684 return tgsi_divmod(ctx, 0, 1); 3685} 3686 3687static int tgsi_imod(struct r600_shader_ctx *ctx) 3688{ 3689 return tgsi_divmod(ctx, 1, 1); 3690} 3691 3692 3693static int tgsi_f2i(struct r600_shader_ctx *ctx) 3694{ 3695 struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction; 3696 struct r600_bytecode_alu alu; 3697 int i, r; 3698 unsigned write_mask = inst->Dst[0].Register.WriteMask; 3699 int last_inst = tgsi_last_instruction(write_mask); 3700 3701 for (i = 0; i < 4; i++) { 3702 if (!(write_mask & (1<<i))) 3703 continue; 3704 3705 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3706 alu.op = ALU_OP1_TRUNC; 3707 3708 alu.dst.sel = ctx->temp_reg; 3709 alu.dst.chan = i; 3710 alu.dst.write = 1; 3711 3712 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3713 if (i == last_inst) 3714 alu.last = 1; 3715 r = r600_bytecode_add_alu(ctx->bc, &alu); 3716 if (r) 3717 return r; 3718 } 3719 3720 for (i = 0; i < 4; i++) { 3721 if (!(write_mask & (1<<i))) 3722 continue; 3723 3724 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3725 alu.op = ctx->inst_info->op; 3726 3727 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3728 3729 alu.src[0].sel = ctx->temp_reg; 3730 alu.src[0].chan = i; 3731 3732 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT) 3733 alu.last = 1; 3734 r = r600_bytecode_add_alu(ctx->bc, &alu); 3735 if (r) 3736 return r; 3737 } 3738 3739 return 0; 3740} 3741 3742static int tgsi_iabs(struct r600_shader_ctx *ctx) 3743{ 3744 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3745 struct r600_bytecode_alu alu; 3746 int i, r; 3747 unsigned write_mask = inst->Dst[0].Register.WriteMask; 3748 int last_inst = tgsi_last_instruction(write_mask); 3749 3750 /* tmp = -src */ 3751 for (i = 0; i < 4; i++) { 3752 if (!(write_mask & (1<<i))) 3753 continue; 3754 3755 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3756 alu.op = ALU_OP2_SUB_INT; 3757 3758 alu.dst.sel = ctx->temp_reg; 3759 alu.dst.chan = i; 3760 alu.dst.write = 1; 3761 3762 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 3763 alu.src[0].sel = V_SQ_ALU_SRC_0; 3764 3765 if (i == last_inst) 3766 alu.last = 1; 3767 r = r600_bytecode_add_alu(ctx->bc, &alu); 3768 if (r) 3769 return r; 3770 } 3771 3772 /* dst = (src >= 0 ? 
src : tmp) */ 3773 for (i = 0; i < 4; i++) { 3774 if (!(write_mask & (1<<i))) 3775 continue; 3776 3777 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3778 alu.op = ALU_OP3_CNDGE_INT; 3779 alu.is_op3 = 1; 3780 alu.dst.write = 1; 3781 3782 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3783 3784 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3785 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 3786 alu.src[2].sel = ctx->temp_reg; 3787 alu.src[2].chan = i; 3788 3789 if (i == last_inst) 3790 alu.last = 1; 3791 r = r600_bytecode_add_alu(ctx->bc, &alu); 3792 if (r) 3793 return r; 3794 } 3795 return 0; 3796} 3797 3798static int tgsi_issg(struct r600_shader_ctx *ctx) 3799{ 3800 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3801 struct r600_bytecode_alu alu; 3802 int i, r; 3803 unsigned write_mask = inst->Dst[0].Register.WriteMask; 3804 int last_inst = tgsi_last_instruction(write_mask); 3805 3806 /* tmp = (src >= 0 ? src : -1) */ 3807 for (i = 0; i < 4; i++) { 3808 if (!(write_mask & (1<<i))) 3809 continue; 3810 3811 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3812 alu.op = ALU_OP3_CNDGE_INT; 3813 alu.is_op3 = 1; 3814 3815 alu.dst.sel = ctx->temp_reg; 3816 alu.dst.chan = i; 3817 alu.dst.write = 1; 3818 3819 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3820 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 3821 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT; 3822 3823 if (i == last_inst) 3824 alu.last = 1; 3825 r = r600_bytecode_add_alu(ctx->bc, &alu); 3826 if (r) 3827 return r; 3828 } 3829 3830 /* dst = (tmp > 0 ? 
1 : tmp) */ 3831 for (i = 0; i < 4; i++) { 3832 if (!(write_mask & (1<<i))) 3833 continue; 3834 3835 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3836 alu.op = ALU_OP3_CNDGT_INT; 3837 alu.is_op3 = 1; 3838 alu.dst.write = 1; 3839 3840 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3841 3842 alu.src[0].sel = ctx->temp_reg; 3843 alu.src[0].chan = i; 3844 3845 alu.src[1].sel = V_SQ_ALU_SRC_1_INT; 3846 3847 alu.src[2].sel = ctx->temp_reg; 3848 alu.src[2].chan = i; 3849 3850 if (i == last_inst) 3851 alu.last = 1; 3852 r = r600_bytecode_add_alu(ctx->bc, &alu); 3853 if (r) 3854 return r; 3855 } 3856 return 0; 3857} 3858 3859 3860 3861static int tgsi_ssg(struct r600_shader_ctx *ctx) 3862{ 3863 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3864 struct r600_bytecode_alu alu; 3865 int i, r; 3866 3867 /* tmp = (src > 0 ? 1 : src) */ 3868 for (i = 0; i < 4; i++) { 3869 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3870 alu.op = ALU_OP3_CNDGT; 3871 alu.is_op3 = 1; 3872 3873 alu.dst.sel = ctx->temp_reg; 3874 alu.dst.chan = i; 3875 3876 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3877 alu.src[1].sel = V_SQ_ALU_SRC_1; 3878 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 3879 3880 if (i == 3) 3881 alu.last = 1; 3882 r = r600_bytecode_add_alu(ctx->bc, &alu); 3883 if (r) 3884 return r; 3885 } 3886 3887 /* dst = (-tmp > 0 ? 
-1 : tmp) */ 3888 for (i = 0; i < 4; i++) { 3889 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3890 alu.op = ALU_OP3_CNDGT; 3891 alu.is_op3 = 1; 3892 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3893 3894 alu.src[0].sel = ctx->temp_reg; 3895 alu.src[0].chan = i; 3896 alu.src[0].neg = 1; 3897 3898 alu.src[1].sel = V_SQ_ALU_SRC_1; 3899 alu.src[1].neg = 1; 3900 3901 alu.src[2].sel = ctx->temp_reg; 3902 alu.src[2].chan = i; 3903 3904 if (i == 3) 3905 alu.last = 1; 3906 r = r600_bytecode_add_alu(ctx->bc, &alu); 3907 if (r) 3908 return r; 3909 } 3910 return 0; 3911} 3912 3913static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst) 3914{ 3915 struct r600_bytecode_alu alu; 3916 int i, r; 3917 3918 for (i = 0; i < 4; i++) { 3919 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3920 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) { 3921 alu.op = ALU_OP0_NOP; 3922 alu.dst.chan = i; 3923 } else { 3924 alu.op = ALU_OP1_MOV; 3925 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3926 alu.src[0].sel = ctx->temp_reg; 3927 alu.src[0].chan = i; 3928 } 3929 if (i == 3) { 3930 alu.last = 1; 3931 } 3932 r = r600_bytecode_add_alu(ctx->bc, &alu); 3933 if (r) 3934 return r; 3935 } 3936 return 0; 3937} 3938 3939static int tgsi_op3(struct r600_shader_ctx *ctx) 3940{ 3941 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3942 struct r600_bytecode_alu alu; 3943 int i, j, r; 3944 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 3945 3946 for (i = 0; i < lasti + 1; i++) { 3947 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 3948 continue; 3949 3950 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3951 alu.op = ctx->inst_info->op; 3952 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 3953 r600_bytecode_src(&alu.src[j], &ctx->src[j], i); 3954 } 3955 3956 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3957 alu.dst.chan = i; 3958 alu.dst.write = 1; 3959 alu.is_op3 = 1; 3960 if (i == lasti) { 3961 alu.last = 1; 
3962 } 3963 r = r600_bytecode_add_alu(ctx->bc, &alu); 3964 if (r) 3965 return r; 3966 } 3967 return 0; 3968} 3969 3970static int tgsi_dp(struct r600_shader_ctx *ctx) 3971{ 3972 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3973 struct r600_bytecode_alu alu; 3974 int i, j, r; 3975 3976 for (i = 0; i < 4; i++) { 3977 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3978 alu.op = ctx->inst_info->op; 3979 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 3980 r600_bytecode_src(&alu.src[j], &ctx->src[j], i); 3981 } 3982 3983 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3984 alu.dst.chan = i; 3985 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 3986 /* handle some special cases */ 3987 switch (ctx->inst_info->tgsi_opcode) { 3988 case TGSI_OPCODE_DP2: 3989 if (i > 1) { 3990 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; 3991 alu.src[0].chan = alu.src[1].chan = 0; 3992 } 3993 break; 3994 case TGSI_OPCODE_DP3: 3995 if (i > 2) { 3996 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; 3997 alu.src[0].chan = alu.src[1].chan = 0; 3998 } 3999 break; 4000 case TGSI_OPCODE_DPH: 4001 if (i == 3) { 4002 alu.src[0].sel = V_SQ_ALU_SRC_1; 4003 alu.src[0].chan = 0; 4004 alu.src[0].neg = 0; 4005 } 4006 break; 4007 default: 4008 break; 4009 } 4010 if (i == 3) { 4011 alu.last = 1; 4012 } 4013 r = r600_bytecode_add_alu(ctx->bc, &alu); 4014 if (r) 4015 return r; 4016 } 4017 return 0; 4018} 4019 4020static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx, 4021 unsigned index) 4022{ 4023 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4024 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY && 4025 inst->Src[index].Register.File != TGSI_FILE_INPUT && 4026 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) || 4027 ctx->src[index].neg || ctx->src[index].abs; 4028} 4029 4030static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx, 4031 unsigned index) 4032{ 4033 struct 
tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4034 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index; 4035} 4036 4037static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading) 4038{ 4039 struct r600_bytecode_vtx vtx; 4040 struct r600_bytecode_alu alu; 4041 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4042 int src_gpr, r, i; 4043 int id = tgsi_tex_get_src_gpr(ctx, 1); 4044 4045 src_gpr = tgsi_tex_get_src_gpr(ctx, 0); 4046 if (src_requires_loading) { 4047 for (i = 0; i < 4; i++) { 4048 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4049 alu.op = ALU_OP1_MOV; 4050 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4051 alu.dst.sel = ctx->temp_reg; 4052 alu.dst.chan = i; 4053 if (i == 3) 4054 alu.last = 1; 4055 alu.dst.write = 1; 4056 r = r600_bytecode_add_alu(ctx->bc, &alu); 4057 if (r) 4058 return r; 4059 } 4060 src_gpr = ctx->temp_reg; 4061 } 4062 4063 memset(&vtx, 0, sizeof(vtx)); 4064 vtx.op = FETCH_OP_VFETCH; 4065 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS; 4066 vtx.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */ 4067 vtx.src_gpr = src_gpr; 4068 vtx.mega_fetch_count = 16; 4069 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 4070 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */ 4071 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */ 4072 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */ 4073 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; /* SEL_W */ 4074 vtx.use_const_fields = 1; 4075 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */ 4076 4077 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx))) 4078 return r; 4079 4080 if (ctx->bc->chip_class >= EVERGREEN) 4081 return 0; 4082 4083 for (i = 0; i < 4; i++) { 4084 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 4085 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 4086 continue; 4087 4088 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4089 alu.op = ALU_OP2_AND_INT; 4090 4091 alu.dst.chan = i; 4092 alu.dst.sel = vtx.dst_gpr; 4093 alu.dst.write = 1; 4094 4095 alu.src[0].sel = vtx.dst_gpr; 4096 alu.src[0].chan = i; 4097 4098 alu.src[1].sel = 512 + (id * 2); 4099 alu.src[1].chan = i % 4; 4100 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 4101 4102 if (i == lasti) 4103 alu.last = 1; 4104 r = r600_bytecode_add_alu(ctx->bc, &alu); 4105 if (r) 4106 return r; 4107 } 4108 4109 if (inst->Dst[0].Register.WriteMask & 3) { 4110 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4111 alu.op = ALU_OP2_OR_INT; 4112 4113 alu.dst.chan = 3; 4114 alu.dst.sel = vtx.dst_gpr; 4115 alu.dst.write = 1; 4116 4117 alu.src[0].sel = vtx.dst_gpr; 4118 alu.src[0].chan = 3; 4119 4120 alu.src[1].sel = 512 + (id * 2) + 1; 4121 alu.src[1].chan = 0; 4122 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 4123 4124 alu.last = 1; 4125 r = r600_bytecode_add_alu(ctx->bc, &alu); 4126 if (r) 4127 return r; 4128 } 4129 return 0; 4130} 4131 4132static int r600_do_buffer_txq(struct r600_shader_ctx *ctx) 4133{ 4134 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4135 struct r600_bytecode_alu alu; 4136 int r; 4137 int id = tgsi_tex_get_src_gpr(ctx, 1); 4138 4139 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4140 alu.op = ALU_OP1_MOV; 4141 4142 if (ctx->bc->chip_class >= EVERGREEN) { 4143 alu.src[0].sel = 512 + (id / 4); 4144 alu.src[0].chan = id % 4; 4145 } else { 4146 /* r600 we have them at channel 2 of the second dword */ 4147 
alu.src[0].sel = 512 + (id * 2) + 1; 4148 alu.src[0].chan = 1; 4149 } 4150 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 4151 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 4152 alu.last = 1; 4153 r = r600_bytecode_add_alu(ctx->bc, &alu); 4154 if (r) 4155 return r; 4156 return 0; 4157} 4158 4159static int tgsi_tex(struct r600_shader_ctx *ctx) 4160{ 4161 static float one_point_five = 1.5f; 4162 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4163 struct r600_bytecode_tex tex; 4164 struct r600_bytecode_alu alu; 4165 unsigned src_gpr; 4166 int r, i, j; 4167 int opcode; 4168 bool read_compressed_msaa = ctx->bc->msaa_texture_mode == MSAA_TEXTURE_COMPRESSED && 4169 inst->Instruction.Opcode == TGSI_OPCODE_TXF && 4170 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA || 4171 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA); 4172 /* Texture fetch instructions can only use gprs as source. 4173 * Also they cannot negate the source or take the absolute value */ 4174 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ && 4175 tgsi_tex_src_requires_loading(ctx, 0)) || 4176 read_compressed_msaa; 4177 boolean src_loaded = FALSE; 4178 unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 
0 : 1; 4179 int8_t offset_x = 0, offset_y = 0, offset_z = 0; 4180 boolean has_txq_cube_array_z = false; 4181 4182 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ && 4183 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 4184 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY))) 4185 if (inst->Dst[0].Register.WriteMask & 4) { 4186 ctx->shader->has_txq_cube_array_z_comp = true; 4187 has_txq_cube_array_z = true; 4188 } 4189 4190 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 || 4191 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 4192 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) 4193 sampler_src_reg = 2; 4194 4195 src_gpr = tgsi_tex_get_src_gpr(ctx, 0); 4196 4197 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) { 4198 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) { 4199 ctx->shader->uses_tex_buffers = true; 4200 return r600_do_buffer_txq(ctx); 4201 } 4202 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) { 4203 if (ctx->bc->chip_class < EVERGREEN) 4204 ctx->shader->uses_tex_buffers = true; 4205 return do_vtx_fetch_inst(ctx, src_requires_loading); 4206 } 4207 } 4208 4209 if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) { 4210 /* get offset values */ 4211 if (inst->Texture.NumOffsets) { 4212 assert(inst->Texture.NumOffsets == 1); 4213 4214 offset_x = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1; 4215 offset_y = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1; 4216 offset_z = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1; 4217 } 4218 } else if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) { 4219 /* TGSI moves the sampler to src reg 3 for TXD */ 4220 sampler_src_reg = 3; 4221 4222 for (i = 1; i < 3; i++) { 4223 /* set gradients h/v */ 4224 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 4225 tex.op = (i == 1) ? 
FETCH_OP_SET_GRADIENTS_H : 4226 FETCH_OP_SET_GRADIENTS_V; 4227 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 4228 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 4229 4230 if (tgsi_tex_src_requires_loading(ctx, i)) { 4231 tex.src_gpr = r600_get_temp(ctx); 4232 tex.src_sel_x = 0; 4233 tex.src_sel_y = 1; 4234 tex.src_sel_z = 2; 4235 tex.src_sel_w = 3; 4236 4237 for (j = 0; j < 4; j++) { 4238 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4239 alu.op = ALU_OP1_MOV; 4240 r600_bytecode_src(&alu.src[0], &ctx->src[i], j); 4241 alu.dst.sel = tex.src_gpr; 4242 alu.dst.chan = j; 4243 if (j == 3) 4244 alu.last = 1; 4245 alu.dst.write = 1; 4246 r = r600_bytecode_add_alu(ctx->bc, &alu); 4247 if (r) 4248 return r; 4249 } 4250 4251 } else { 4252 tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i); 4253 tex.src_sel_x = ctx->src[i].swizzle[0]; 4254 tex.src_sel_y = ctx->src[i].swizzle[1]; 4255 tex.src_sel_z = ctx->src[i].swizzle[2]; 4256 tex.src_sel_w = ctx->src[i].swizzle[3]; 4257 tex.src_rel = ctx->src[i].rel; 4258 } 4259 tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */ 4260 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7; 4261 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) { 4262 tex.coord_type_x = 1; 4263 tex.coord_type_y = 1; 4264 tex.coord_type_z = 1; 4265 tex.coord_type_w = 1; 4266 } 4267 r = r600_bytecode_add_tex(ctx->bc, &tex); 4268 if (r) 4269 return r; 4270 } 4271 } else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) { 4272 int out_chan; 4273 /* Add perspective divide */ 4274 if (ctx->bc->chip_class == CAYMAN) { 4275 out_chan = 2; 4276 for (i = 0; i < 3; i++) { 4277 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4278 alu.op = ALU_OP1_RECIP_IEEE; 4279 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4280 4281 alu.dst.sel = ctx->temp_reg; 4282 alu.dst.chan = i; 4283 if (i == 2) 4284 alu.last = 1; 4285 if (out_chan == i) 4286 alu.dst.write = 1; 4287 r = r600_bytecode_add_alu(ctx->bc, &alu); 4288 if (r) 
4289 return r; 4290 } 4291 4292 } else { 4293 out_chan = 3; 4294 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4295 alu.op = ALU_OP1_RECIP_IEEE; 4296 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4297 4298 alu.dst.sel = ctx->temp_reg; 4299 alu.dst.chan = out_chan; 4300 alu.last = 1; 4301 alu.dst.write = 1; 4302 r = r600_bytecode_add_alu(ctx->bc, &alu); 4303 if (r) 4304 return r; 4305 } 4306 4307 for (i = 0; i < 3; i++) { 4308 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4309 alu.op = ALU_OP2_MUL; 4310 alu.src[0].sel = ctx->temp_reg; 4311 alu.src[0].chan = out_chan; 4312 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 4313 alu.dst.sel = ctx->temp_reg; 4314 alu.dst.chan = i; 4315 alu.dst.write = 1; 4316 r = r600_bytecode_add_alu(ctx->bc, &alu); 4317 if (r) 4318 return r; 4319 } 4320 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4321 alu.op = ALU_OP1_MOV; 4322 alu.src[0].sel = V_SQ_ALU_SRC_1; 4323 alu.src[0].chan = 0; 4324 alu.dst.sel = ctx->temp_reg; 4325 alu.dst.chan = 3; 4326 alu.last = 1; 4327 alu.dst.write = 1; 4328 r = r600_bytecode_add_alu(ctx->bc, &alu); 4329 if (r) 4330 return r; 4331 src_loaded = TRUE; 4332 src_gpr = ctx->temp_reg; 4333 } 4334 4335 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || 4336 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 4337 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 4338 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && 4339 inst->Instruction.Opcode != TGSI_OPCODE_TXQ && 4340 inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { 4341 4342 static const unsigned src0_swizzle[] = {2, 2, 0, 1}; 4343 static const unsigned src1_swizzle[] = {1, 0, 2, 2}; 4344 4345 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */ 4346 for (i = 0; i < 4; i++) { 4347 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4348 alu.op = ALU_OP2_CUBE; 4349 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]); 4350 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]); 4351 alu.dst.sel = ctx->temp_reg; 4352 
alu.dst.chan = i; 4353 if (i == 3) 4354 alu.last = 1; 4355 alu.dst.write = 1; 4356 r = r600_bytecode_add_alu(ctx->bc, &alu); 4357 if (r) 4358 return r; 4359 } 4360 4361 /* tmp1.z = RCP_e(|tmp1.z|) */ 4362 if (ctx->bc->chip_class == CAYMAN) { 4363 for (i = 0; i < 3; i++) { 4364 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4365 alu.op = ALU_OP1_RECIP_IEEE; 4366 alu.src[0].sel = ctx->temp_reg; 4367 alu.src[0].chan = 2; 4368 alu.src[0].abs = 1; 4369 alu.dst.sel = ctx->temp_reg; 4370 alu.dst.chan = i; 4371 if (i == 2) 4372 alu.dst.write = 1; 4373 if (i == 2) 4374 alu.last = 1; 4375 r = r600_bytecode_add_alu(ctx->bc, &alu); 4376 if (r) 4377 return r; 4378 } 4379 } else { 4380 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4381 alu.op = ALU_OP1_RECIP_IEEE; 4382 alu.src[0].sel = ctx->temp_reg; 4383 alu.src[0].chan = 2; 4384 alu.src[0].abs = 1; 4385 alu.dst.sel = ctx->temp_reg; 4386 alu.dst.chan = 2; 4387 alu.dst.write = 1; 4388 alu.last = 1; 4389 r = r600_bytecode_add_alu(ctx->bc, &alu); 4390 if (r) 4391 return r; 4392 } 4393 4394 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x 4395 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x 4396 * muladd has no writemask, have to use another temp 4397 */ 4398 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4399 alu.op = ALU_OP3_MULADD; 4400 alu.is_op3 = 1; 4401 4402 alu.src[0].sel = ctx->temp_reg; 4403 alu.src[0].chan = 0; 4404 alu.src[1].sel = ctx->temp_reg; 4405 alu.src[1].chan = 2; 4406 4407 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 4408 alu.src[2].chan = 0; 4409 alu.src[2].value = *(uint32_t *)&one_point_five; 4410 4411 alu.dst.sel = ctx->temp_reg; 4412 alu.dst.chan = 0; 4413 alu.dst.write = 1; 4414 4415 r = r600_bytecode_add_alu(ctx->bc, &alu); 4416 if (r) 4417 return r; 4418 4419 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4420 alu.op = ALU_OP3_MULADD; 4421 alu.is_op3 = 1; 4422 4423 alu.src[0].sel = ctx->temp_reg; 4424 alu.src[0].chan = 1; 4425 alu.src[1].sel = ctx->temp_reg; 4426 alu.src[1].chan = 2; 4427 4428 
alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 4429 alu.src[2].chan = 0; 4430 alu.src[2].value = *(uint32_t *)&one_point_five; 4431 4432 alu.dst.sel = ctx->temp_reg; 4433 alu.dst.chan = 1; 4434 alu.dst.write = 1; 4435 4436 alu.last = 1; 4437 r = r600_bytecode_add_alu(ctx->bc, &alu); 4438 if (r) 4439 return r; 4440 /* write initial compare value into Z component 4441 - W src 0 for shadow cube 4442 - X src 1 for shadow cube array */ 4443 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 4444 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 4445 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4446 alu.op = ALU_OP1_MOV; 4447 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) 4448 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 4449 else 4450 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4451 alu.dst.sel = ctx->temp_reg; 4452 alu.dst.chan = 2; 4453 alu.dst.write = 1; 4454 alu.last = 1; 4455 r = r600_bytecode_add_alu(ctx->bc, &alu); 4456 if (r) 4457 return r; 4458 } 4459 4460 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 4461 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 4462 if (ctx->bc->chip_class >= EVERGREEN) { 4463 int mytmp = r600_get_temp(ctx); 4464 static const float eight = 8.0f; 4465 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4466 alu.op = ALU_OP1_MOV; 4467 alu.src[0].sel = ctx->temp_reg; 4468 alu.src[0].chan = 3; 4469 alu.dst.sel = mytmp; 4470 alu.dst.chan = 0; 4471 alu.dst.write = 1; 4472 alu.last = 1; 4473 r = r600_bytecode_add_alu(ctx->bc, &alu); 4474 if (r) 4475 return r; 4476 4477 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */ 4478 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4479 alu.op = ALU_OP3_MULADD; 4480 alu.is_op3 = 1; 4481 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4482 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4483 alu.src[1].chan = 0; 4484 alu.src[1].value = *(uint32_t *)&eight; 4485 alu.src[2].sel = mytmp; 4486 alu.src[2].chan = 0; 4487 alu.dst.sel = 
ctx->temp_reg; 4488 alu.dst.chan = 3; 4489 alu.dst.write = 1; 4490 alu.last = 1; 4491 r = r600_bytecode_add_alu(ctx->bc, &alu); 4492 if (r) 4493 return r; 4494 } else if (ctx->bc->chip_class < EVERGREEN) { 4495 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 4496 tex.op = FETCH_OP_SET_CUBEMAP_INDEX; 4497 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 4498 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 4499 tex.src_gpr = r600_get_temp(ctx); 4500 tex.src_sel_x = 0; 4501 tex.src_sel_y = 0; 4502 tex.src_sel_z = 0; 4503 tex.src_sel_w = 0; 4504 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7; 4505 tex.coord_type_x = 1; 4506 tex.coord_type_y = 1; 4507 tex.coord_type_z = 1; 4508 tex.coord_type_w = 1; 4509 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4510 alu.op = ALU_OP1_MOV; 4511 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4512 alu.dst.sel = tex.src_gpr; 4513 alu.dst.chan = 0; 4514 alu.last = 1; 4515 alu.dst.write = 1; 4516 r = r600_bytecode_add_alu(ctx->bc, &alu); 4517 if (r) 4518 return r; 4519 4520 r = r600_bytecode_add_tex(ctx->bc, &tex); 4521 if (r) 4522 return r; 4523 } 4524 4525 } 4526 4527 /* for cube forms of lod and bias we need to route things */ 4528 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB || 4529 inst->Instruction.Opcode == TGSI_OPCODE_TXL || 4530 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 4531 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { 4532 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4533 alu.op = ALU_OP1_MOV; 4534 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 4535 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) 4536 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 4537 else 4538 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4539 alu.dst.sel = ctx->temp_reg; 4540 alu.dst.chan = 2; 4541 alu.last = 1; 4542 alu.dst.write = 1; 4543 r = r600_bytecode_add_alu(ctx->bc, &alu); 4544 if (r) 4545 return r; 4546 } 4547 4548 src_loaded = TRUE; 4549 src_gpr = ctx->temp_reg; 4550 } 
4551 4552 if (src_requires_loading && !src_loaded) { 4553 for (i = 0; i < 4; i++) { 4554 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4555 alu.op = ALU_OP1_MOV; 4556 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4557 alu.dst.sel = ctx->temp_reg; 4558 alu.dst.chan = i; 4559 if (i == 3) 4560 alu.last = 1; 4561 alu.dst.write = 1; 4562 r = r600_bytecode_add_alu(ctx->bc, &alu); 4563 if (r) 4564 return r; 4565 } 4566 src_loaded = TRUE; 4567 src_gpr = ctx->temp_reg; 4568 } 4569 4570 /* Obtain the sample index for reading a compressed MSAA color texture. 4571 * To read the FMASK, we use the ldfptr instruction, which tells us 4572 * where the samples are stored. 4573 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210, 4574 * which is the identity mapping. Each nibble says which physical sample 4575 * should be fetched to get that sample. 4576 * 4577 * Assume src.z contains the sample index. It should be modified like this: 4578 * src.z = (ldfptr() >> (src.z * 4)) & 0xF; 4579 * Then fetch the texel with src. 
4580 */ 4581 if (read_compressed_msaa) { 4582 unsigned sample_chan = 3; 4583 unsigned temp = r600_get_temp(ctx); 4584 assert(src_loaded); 4585 4586 /* temp.w = ldfptr() */ 4587 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 4588 tex.op = FETCH_OP_LD; 4589 tex.inst_mod = 1; /* to indicate this is ldfptr */ 4590 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 4591 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 4592 tex.src_gpr = src_gpr; 4593 tex.dst_gpr = temp; 4594 tex.dst_sel_x = 7; /* mask out these components */ 4595 tex.dst_sel_y = 7; 4596 tex.dst_sel_z = 7; 4597 tex.dst_sel_w = 0; /* store X */ 4598 tex.src_sel_x = 0; 4599 tex.src_sel_y = 1; 4600 tex.src_sel_z = 2; 4601 tex.src_sel_w = 3; 4602 tex.offset_x = offset_x; 4603 tex.offset_y = offset_y; 4604 tex.offset_z = offset_z; 4605 r = r600_bytecode_add_tex(ctx->bc, &tex); 4606 if (r) 4607 return r; 4608 4609 /* temp.x = sample_index*4 */ 4610 if (ctx->bc->chip_class == CAYMAN) { 4611 for (i = 0 ; i < 4; i++) { 4612 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4613 alu.op = ALU_OP2_MULLO_INT; 4614 alu.src[0].sel = src_gpr; 4615 alu.src[0].chan = sample_chan; 4616 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4617 alu.src[1].value = 4; 4618 alu.dst.sel = temp; 4619 alu.dst.chan = i; 4620 alu.dst.write = i == 0; 4621 if (i == 3) 4622 alu.last = 1; 4623 r = r600_bytecode_add_alu(ctx->bc, &alu); 4624 if (r) 4625 return r; 4626 } 4627 } else { 4628 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4629 alu.op = ALU_OP2_MULLO_INT; 4630 alu.src[0].sel = src_gpr; 4631 alu.src[0].chan = sample_chan; 4632 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4633 alu.src[1].value = 4; 4634 alu.dst.sel = temp; 4635 alu.dst.chan = 0; 4636 alu.dst.write = 1; 4637 alu.last = 1; 4638 r = r600_bytecode_add_alu(ctx->bc, &alu); 4639 if (r) 4640 return r; 4641 } 4642 4643 /* sample_index = temp.w >> temp.x */ 4644 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4645 alu.op = ALU_OP2_LSHR_INT; 4646 alu.src[0].sel 
= temp; 4647 alu.src[0].chan = 3; 4648 alu.src[1].sel = temp; 4649 alu.src[1].chan = 0; 4650 alu.dst.sel = src_gpr; 4651 alu.dst.chan = sample_chan; 4652 alu.dst.write = 1; 4653 alu.last = 1; 4654 r = r600_bytecode_add_alu(ctx->bc, &alu); 4655 if (r) 4656 return r; 4657 4658 /* sample_index & 0xF */ 4659 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4660 alu.op = ALU_OP2_AND_INT; 4661 alu.src[0].sel = src_gpr; 4662 alu.src[0].chan = sample_chan; 4663 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4664 alu.src[1].value = 0xF; 4665 alu.dst.sel = src_gpr; 4666 alu.dst.chan = sample_chan; 4667 alu.dst.write = 1; 4668 alu.last = 1; 4669 r = r600_bytecode_add_alu(ctx->bc, &alu); 4670 if (r) 4671 return r; 4672#if 0 4673 /* visualize the FMASK */ 4674 for (i = 0; i < 4; i++) { 4675 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4676 alu.op = ALU_OP1_INT_TO_FLT; 4677 alu.src[0].sel = src_gpr; 4678 alu.src[0].chan = sample_chan; 4679 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 4680 alu.dst.chan = i; 4681 alu.dst.write = 1; 4682 alu.last = 1; 4683 r = r600_bytecode_add_alu(ctx->bc, &alu); 4684 if (r) 4685 return r; 4686 } 4687 return 0; 4688#endif 4689 } 4690 4691 /* does this shader want a num layers from TXQ for a cube array? 
*/ 4692 if (has_txq_cube_array_z) { 4693 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 4694 4695 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4696 alu.op = ALU_OP1_MOV; 4697 4698 alu.src[0].sel = 512 + (id / 4); 4699 alu.src[0].kc_bank = R600_TXQ_CONST_BUFFER; 4700 alu.src[0].chan = id % 4; 4701 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); 4702 alu.last = 1; 4703 r = r600_bytecode_add_alu(ctx->bc, &alu); 4704 if (r) 4705 return r; 4706 /* disable writemask from texture instruction */ 4707 inst->Dst[0].Register.WriteMask &= ~4; 4708 } 4709 4710 opcode = ctx->inst_info->op; 4711 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || 4712 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 4713 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT || 4714 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 4715 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY || 4716 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY || 4717 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 4718 switch (opcode) { 4719 case FETCH_OP_SAMPLE: 4720 opcode = FETCH_OP_SAMPLE_C; 4721 break; 4722 case FETCH_OP_SAMPLE_L: 4723 opcode = FETCH_OP_SAMPLE_C_L; 4724 break; 4725 case FETCH_OP_SAMPLE_LB: 4726 opcode = FETCH_OP_SAMPLE_C_LB; 4727 break; 4728 case FETCH_OP_SAMPLE_G: 4729 opcode = FETCH_OP_SAMPLE_C_G; 4730 break; 4731 } 4732 } 4733 4734 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 4735 tex.op = opcode; 4736 4737 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 4738 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 4739 tex.src_gpr = src_gpr; 4740 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 4741 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 4742 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 4743 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; 4744 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; 4745 4746 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) { 4747 tex.src_sel_x = 4; 4748 tex.src_sel_y = 4; 4749 tex.src_sel_z = 4; 4750 tex.src_sel_w = 4; 4751 } else if (src_loaded) { 4752 tex.src_sel_x = 0; 4753 tex.src_sel_y = 1; 4754 tex.src_sel_z = 2; 4755 tex.src_sel_w = 3; 4756 } else { 4757 tex.src_sel_x = ctx->src[0].swizzle[0]; 4758 tex.src_sel_y = ctx->src[0].swizzle[1]; 4759 tex.src_sel_z = ctx->src[0].swizzle[2]; 4760 tex.src_sel_w = ctx->src[0].swizzle[3]; 4761 tex.src_rel = ctx->src[0].rel; 4762 } 4763 4764 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE || 4765 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 4766 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 4767 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 4768 tex.src_sel_x = 1; 4769 tex.src_sel_y = 0; 4770 tex.src_sel_z = 3; 4771 tex.src_sel_w = 2; /* route Z compare or Lod value into W */ 4772 } 4773 4774 if (inst->Texture.Texture != TGSI_TEXTURE_RECT && 4775 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) { 4776 tex.coord_type_x = 1; 4777 tex.coord_type_y = 1; 4778 } 4779 tex.coord_type_z = 1; 4780 tex.coord_type_w = 1; 4781 4782 tex.offset_x = offset_x; 4783 tex.offset_y = offset_y; 4784 tex.offset_z = offset_z; 4785 4786 /* Put the depth for comparison in W. 4787 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W. 4788 * Some instructions expect the depth in Z. 
*/ 4789 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || 4790 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 4791 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT || 4792 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) && 4793 opcode != FETCH_OP_SAMPLE_C_L && 4794 opcode != FETCH_OP_SAMPLE_C_LB) { 4795 tex.src_sel_w = tex.src_sel_z; 4796 } 4797 4798 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY || 4799 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) { 4800 if (opcode == FETCH_OP_SAMPLE_C_L || 4801 opcode == FETCH_OP_SAMPLE_C_LB) { 4802 /* the array index is read from Y */ 4803 tex.coord_type_y = 0; 4804 } else { 4805 /* the array index is read from Z */ 4806 tex.coord_type_z = 0; 4807 tex.src_sel_z = tex.src_sel_y; 4808 } 4809 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 4810 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY || 4811 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 4812 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && 4813 (ctx->bc->chip_class >= EVERGREEN))) 4814 /* the array index is read from Z */ 4815 tex.coord_type_z = 0; 4816 4817 /* mask unused source components */ 4818 if (opcode == FETCH_OP_SAMPLE) { 4819 switch (inst->Texture.Texture) { 4820 case TGSI_TEXTURE_2D: 4821 case TGSI_TEXTURE_RECT: 4822 tex.src_sel_z = 7; 4823 tex.src_sel_w = 7; 4824 break; 4825 case TGSI_TEXTURE_1D_ARRAY: 4826 tex.src_sel_y = 7; 4827 tex.src_sel_w = 7; 4828 break; 4829 case TGSI_TEXTURE_1D: 4830 tex.src_sel_y = 7; 4831 tex.src_sel_z = 7; 4832 tex.src_sel_w = 7; 4833 break; 4834 } 4835 } 4836 4837 r = r600_bytecode_add_tex(ctx->bc, &tex); 4838 if (r) 4839 return r; 4840 4841 /* add shadow ambient support - gallium doesn't do it yet */ 4842 return 0; 4843} 4844 4845static int tgsi_lrp(struct r600_shader_ctx *ctx) 4846{ 4847 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4848 struct r600_bytecode_alu alu; 4849 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 
	unsigned i;
	int r;

	/* optimize if it's just an equal balance:
	 * LRP(0.5, a, b) = (a + b) / 2, which one ADD with the
	 * divide-by-2 output modifier (omod = 3) can do directly. */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3;	/* output modifier: result / 2 */
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				/* close the ALU group on the last written channel */
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* pass 1: temp = 1 - src0, per enabled channel */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		r600_bytecode_src_toggle_neg(&alu.src[1]);	/* 1 + (-src0) */
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pass 2: temp = (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pass 3: dst = src0 * src1 + (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
4923 4924 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4925 alu.op = ALU_OP3_MULADD; 4926 alu.is_op3 = 1; 4927 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4928 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 4929 alu.src[2].sel = ctx->temp_reg; 4930 alu.src[2].chan = i; 4931 4932 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4933 alu.dst.chan = i; 4934 if (i == lasti) { 4935 alu.last = 1; 4936 } 4937 r = r600_bytecode_add_alu(ctx->bc, &alu); 4938 if (r) 4939 return r; 4940 } 4941 return 0; 4942} 4943 4944static int tgsi_cmp(struct r600_shader_ctx *ctx) 4945{ 4946 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4947 struct r600_bytecode_alu alu; 4948 int i, r; 4949 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 4950 4951 for (i = 0; i < lasti + 1; i++) { 4952 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 4953 continue; 4954 4955 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4956 alu.op = ALU_OP3_CNDGE; 4957 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4958 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 4959 r600_bytecode_src(&alu.src[2], &ctx->src[1], i); 4960 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4961 alu.dst.chan = i; 4962 alu.dst.write = 1; 4963 alu.is_op3 = 1; 4964 if (i == lasti) 4965 alu.last = 1; 4966 r = r600_bytecode_add_alu(ctx->bc, &alu); 4967 if (r) 4968 return r; 4969 } 4970 return 0; 4971} 4972 4973static int tgsi_ucmp(struct r600_shader_ctx *ctx) 4974{ 4975 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4976 struct r600_bytecode_alu alu; 4977 int i, r; 4978 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 4979 4980 for (i = 0; i < lasti + 1; i++) { 4981 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 4982 continue; 4983 4984 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4985 alu.op = ALU_OP3_CNDGE_INT; 4986 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4987 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 4988 
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* TGSI XPD (cross product):
 *   dst.xyz = src0.yzx * src1.zxy - src0.zxy * src1.yzx
 * done as a MUL pass into temp_reg followed by a MULADD pass that
 * subtracts it (src[2].neg).  The swizzle tables below provide the
 * rotated component orders; channel 3 is computed as 0*0 - 0.
 * NOTE(review): TGSI documents XPD's w result as 1.0; this emits 0 -
 * confirm whether any state tracker relies on w here. */
static int tgsi_xpd(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	static const unsigned int src0_swizzle[] = {2, 0, 1};
	static const unsigned int src1_swizzle[] = {1, 2, 0};
	struct r600_bytecode_alu alu;
	uint32_t use_temp = 0;
	int i, r;

	/* a partial writemask forces the helper-copy path via temp_reg */
	if (inst->Dst[0].Register.WriteMask != 0xf)
		use_temp = 1;

	/* pass 1: temp[i] = src0[zxy] * src1[yzx] (the term to subtract) */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pass 2: dst[i] = src0[yzx] * src1[zxy] - temp[i] */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;

		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].neg = 1;	/* subtract the pass-1 product */
		alu.src[2].chan = i;

		if (use_temp)
			alu.dst.sel = ctx->temp_reg;
		else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
alu.dst.chan = i; 5061 alu.dst.write = 1; 5062 alu.is_op3 = 1; 5063 if (i == 3) 5064 alu.last = 1; 5065 r = r600_bytecode_add_alu(ctx->bc, &alu); 5066 if (r) 5067 return r; 5068 } 5069 if (use_temp) 5070 return tgsi_helper_copy(ctx, inst); 5071 return 0; 5072} 5073 5074static int tgsi_exp(struct r600_shader_ctx *ctx) 5075{ 5076 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5077 struct r600_bytecode_alu alu; 5078 int r; 5079 int i; 5080 5081 /* result.x = 2^floor(src); */ 5082 if (inst->Dst[0].Register.WriteMask & 1) { 5083 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5084 5085 alu.op = ALU_OP1_FLOOR; 5086 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5087 5088 alu.dst.sel = ctx->temp_reg; 5089 alu.dst.chan = 0; 5090 alu.dst.write = 1; 5091 alu.last = 1; 5092 r = r600_bytecode_add_alu(ctx->bc, &alu); 5093 if (r) 5094 return r; 5095 5096 if (ctx->bc->chip_class == CAYMAN) { 5097 for (i = 0; i < 3; i++) { 5098 alu.op = ALU_OP1_EXP_IEEE; 5099 alu.src[0].sel = ctx->temp_reg; 5100 alu.src[0].chan = 0; 5101 5102 alu.dst.sel = ctx->temp_reg; 5103 alu.dst.chan = i; 5104 alu.dst.write = i == 0; 5105 alu.last = i == 2; 5106 r = r600_bytecode_add_alu(ctx->bc, &alu); 5107 if (r) 5108 return r; 5109 } 5110 } else { 5111 alu.op = ALU_OP1_EXP_IEEE; 5112 alu.src[0].sel = ctx->temp_reg; 5113 alu.src[0].chan = 0; 5114 5115 alu.dst.sel = ctx->temp_reg; 5116 alu.dst.chan = 0; 5117 alu.dst.write = 1; 5118 alu.last = 1; 5119 r = r600_bytecode_add_alu(ctx->bc, &alu); 5120 if (r) 5121 return r; 5122 } 5123 } 5124 5125 /* result.y = tmp - floor(tmp); */ 5126 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) { 5127 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5128 5129 alu.op = ALU_OP1_FRACT; 5130 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5131 5132 alu.dst.sel = ctx->temp_reg; 5133#if 0 5134 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5135 if (r) 5136 return r; 5137#endif 5138 alu.dst.write = 1; 5139 alu.dst.chan = 1; 5140 5141 alu.last = 
1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = RoughApprox2ToX(tmp);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
		if (ctx->bc->chip_class == CAYMAN) {
			/* EXP_IEEE runs in the vector slots on cayman, so it is
			 * replicated across 3 lanes (see the file header notes) */
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				}

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;

			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* all channels were staged in temp_reg; copy the enabled ones out */
	return tgsi_helper_copy(ctx, inst);
}

/* TGSI LOG: result = (floor(lg2|src.x|),
 *                     |src.x| / 2^floor(lg2|src.x|),
 *                     lg2|src.x|, 1.0),
 * built per enabled channel in temp_reg, then copied out by
 * tgsi_helper_copy(). */
static int tgsi_log(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i;

	/* result.x = floor(log2(|src|)); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
r600_bytecode_src_set_abs(&alu.src[0]); 5219 5220 alu.dst.sel = ctx->temp_reg; 5221 alu.dst.chan = i; 5222 if (i == 0) 5223 alu.dst.write = 1; 5224 if (i == 2) 5225 alu.last = 1; 5226 r = r600_bytecode_add_alu(ctx->bc, &alu); 5227 if (r) 5228 return r; 5229 } 5230 5231 } else { 5232 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5233 5234 alu.op = ALU_OP1_LOG_IEEE; 5235 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5236 r600_bytecode_src_set_abs(&alu.src[0]); 5237 5238 alu.dst.sel = ctx->temp_reg; 5239 alu.dst.chan = 0; 5240 alu.dst.write = 1; 5241 alu.last = 1; 5242 r = r600_bytecode_add_alu(ctx->bc, &alu); 5243 if (r) 5244 return r; 5245 } 5246 5247 alu.op = ALU_OP1_FLOOR; 5248 alu.src[0].sel = ctx->temp_reg; 5249 alu.src[0].chan = 0; 5250 5251 alu.dst.sel = ctx->temp_reg; 5252 alu.dst.chan = 0; 5253 alu.dst.write = 1; 5254 alu.last = 1; 5255 5256 r = r600_bytecode_add_alu(ctx->bc, &alu); 5257 if (r) 5258 return r; 5259 } 5260 5261 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */ 5262 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) { 5263 5264 if (ctx->bc->chip_class == CAYMAN) { 5265 for (i = 0; i < 3; i++) { 5266 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5267 5268 alu.op = ALU_OP1_LOG_IEEE; 5269 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5270 r600_bytecode_src_set_abs(&alu.src[0]); 5271 5272 alu.dst.sel = ctx->temp_reg; 5273 alu.dst.chan = i; 5274 if (i == 1) 5275 alu.dst.write = 1; 5276 if (i == 2) 5277 alu.last = 1; 5278 5279 r = r600_bytecode_add_alu(ctx->bc, &alu); 5280 if (r) 5281 return r; 5282 } 5283 } else { 5284 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5285 5286 alu.op = ALU_OP1_LOG_IEEE; 5287 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5288 r600_bytecode_src_set_abs(&alu.src[0]); 5289 5290 alu.dst.sel = ctx->temp_reg; 5291 alu.dst.chan = 1; 5292 alu.dst.write = 1; 5293 alu.last = 1; 5294 5295 r = r600_bytecode_add_alu(ctx->bc, &alu); 5296 if (r) 5297 return r; 5298 } 5299 5300 memset(&alu, 0, sizeof(struct 
r600_bytecode_alu));

		/* tmp.y = floor(lg2(|src|)) */
		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* tmp.y = 2 ^ tmp.y (t-slot only pre-cayman, hence the
		 * 3-lane replication on cayman) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* tmp.y = 1 / tmp.y */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		/* result.y = |src| * tmp.y, i.e. |src| / 2^floor(lg2(|src|)) */
		alu.op = ALU_OP2_MUL;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		r600_bytecode_src_set_abs(&alu.src[0]);

		alu.src[1].sel = ctx->temp_reg;
		alu.src[1].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = log2(|src|);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				if (i == 2)
					alu.dst.write = 1;
				alu.dst.chan = i;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0; */
	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return tgsi_helper_copy(ctx, inst);
}

/* ARL/ARR/UARL on evergreen+: convert the index operand to an integer
 * in bc->ar_reg; the hw AR register itself is reloaded lazily from
 * ar_reg on next use (ar_loaded is cleared below). */
static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst =
&ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* float index, round down */
		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
		break;
	case TGSI_OPCODE_ARR:
		alu.op = ALU_OP1_FLT_TO_INT;
		break;
	case TGSI_OPCODE_UARL:
		/* operand is already an integer */
		alu.op = ALU_OP1_MOV;
		break;
	default:
		assert(0);
		return -1;
	}

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.last = 1;
	alu.dst.sel = ctx->bc->ar_reg;
	alu.dst.write = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* force a reload of AR from ar_reg before the next relative access */
	ctx->bc->ar_loaded = 0;
	return 0;
}
/* ARL/ARR/UARL on r600-r700: there is no FLT_TO_INT_FLOOR, so ARL is
 * emulated as FLOOR followed by FLT_TO_INT through bc->ar_reg. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
		break;
	case TGSI_OPCODE_ARR:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
		break;
	case TGSI_OPCODE_UARL:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], &ctx->src[0],
0);
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
		break;
	default:
		assert(0);
		return -1;
	}

	/* force a reload of AR from ar_reg before the next relative access */
	ctx->bc->ar_loaded = 0;
	return 0;
}

/* TGSI DST: dst = (1, src0.y*src1.y, src0.z, src1.w).  Every channel is
 * emitted as a MUL; the inline constant 1 replaces whichever source a
 * channel does not use. */
static int tgsi_opdst(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r = 0;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP2_MUL;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		/* x and w do not read src0 */
		if (i == 0 || i == 3) {
			alu.src[0].sel = V_SQ_ALU_SRC_1;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		/* x and z do not read src1 */
		if (i == 0 || i == 2) {
			alu.src[1].sel = V_SQ_ALU_SRC_1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Emit a predicate-setting compare of src0 against 0 with the given
 * opcode; sets execute_mask/update_pred so the following instructions
 * are predicated.  alu_type selects the CF clause type (e.g. PUSH_BEFORE). */
static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
{
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opcode;
	alu.execute_mask = 1;
	alu.update_pred = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.dst.chan = 0;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0;
	alu.src[1].chan = 0;

	alu.last = 1;

	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
	if (r)
		return r;
	return 0;
}

/* Emit stack pops: fold 1 or 2 pops into the preceding ALU clause
 * (ALU_POP_AFTER / ALU_POP2_AFTER) when possible, otherwise emit a
 * standalone CF POP instruction. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if
 (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* more pops than an ALU clause can carry */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* resume at the CF instruction following the POP */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}

/* Recompute the worst-case hw stack depth (in STACK_SIZE entries) from
 * the current loop/push/push_wqm counters, applying the per-chip extra
 * reservations, and raise stack.max_entries if needed. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
		unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	unsigned entry_size = stack->entry_size;

	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 * NOTE: it seems we also need to reserve additional element in some
		 * other cases, e.g.
when we have 4 levels of PUSH_VPM in the shader, 5689 * then STACK_SIZE should be 2 instead of 1 */ 5690 if (reason == FC_PUSH_VPM) { 5691 elements += 1; 5692 } 5693 break; 5694 5695 default: 5696 assert(0); 5697 break; 5698 } 5699 5700 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4 5701 * for all chips, so we use 4 in the final formula, not the real entry_size 5702 * for the chip */ 5703 entry_size = 4; 5704 5705 entries = (elements + (entry_size - 1)) / entry_size; 5706 5707 if (entries > stack->max_entries) 5708 stack->max_entries = entries; 5709} 5710 5711static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason) 5712{ 5713 switch(reason) { 5714 case FC_PUSH_VPM: 5715 --ctx->bc->stack.push; 5716 assert(ctx->bc->stack.push >= 0); 5717 break; 5718 case FC_PUSH_WQM: 5719 --ctx->bc->stack.push_wqm; 5720 assert(ctx->bc->stack.push_wqm >= 0); 5721 break; 5722 case FC_LOOP: 5723 --ctx->bc->stack.loop; 5724 assert(ctx->bc->stack.loop >= 0); 5725 break; 5726 default: 5727 assert(0); 5728 break; 5729 } 5730} 5731 5732static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason) 5733{ 5734 switch (reason) { 5735 case FC_PUSH_VPM: 5736 ++ctx->bc->stack.push; 5737 break; 5738 case FC_PUSH_WQM: 5739 ++ctx->bc->stack.push_wqm; 5740 case FC_LOOP: 5741 ++ctx->bc->stack.loop; 5742 break; 5743 default: 5744 assert(0); 5745 } 5746 5747 callstack_update_max_depth(ctx, reason); 5748} 5749 5750static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp) 5751{ 5752 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp]; 5753 5754 sp->mid = realloc((void *)sp->mid, 5755 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1)); 5756 sp->mid[sp->num_mid] = ctx->bc->cf_last; 5757 sp->num_mid++; 5758} 5759 5760static void fc_pushlevel(struct r600_shader_ctx *ctx, int type) 5761{ 5762 ctx->bc->fc_sp++; 5763 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type; 5764 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last; 5765} 

/* Close the innermost flow-control frame: release its "mid" CF list,
 * clear the slot, and decrement fc_sp. */
static void fc_poplevel(struct r600_shader_ctx *ctx)
{
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
	free(sp->mid);
	sp->mid = NULL;
	sp->num_mid = 0;
	sp->start = NULL;
	sp->type = 0;
	ctx->bc->fc_sp--;
}

/* Unfinished sketches of subroutine/RET support — never compiled. */
#if 0
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif

/* Common body of IF/UIF: emit a predicate-set ALU clause (pushing the
 * active mask) followed by a JUMP whose target is patched later by
 * tgsi_else()/tgsi_endif().  "opcode" is the PRED_SETNE flavor used for
 * the condition. */
static int emit_if(struct r600_shader_ctx *ctx, int opcode)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	callstack_push(ctx, FC_PUSH_VPM);
	return 0;
}

/* Float IF: condition is "src != 0.0f". */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}

/* Integer IF (UIF): condition is "src != 0" as an int compare. */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}

/* ELSE: emit the ELSE CF instruction, record it as the frame's "mid", and
 * patch the IF's JUMP (frame "start") to land on this ELSE. */
static int tgsi_else(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, ctx->bc->fc_sp);
	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
	return 0;
}

/* ENDIF: pop the active mask, then patch either the IF's JUMP (no ELSE
 * seen) or the ELSE (mid[0]) to jump past the last emitted CF. */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
	} else {
		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
	}
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}

/* BGNLOOP: open a loop frame; addresses are fixed up in tgsi_endloop(). */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}

/* ENDLOOP: emit LOOP_END and patch the loop's start/end/break/continue
 * CF addresses (see the r600isa rules quoted below). */
static int tgsi_endloop(struct r600_shader_ctx *ctx)
{
	int i;

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);

	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
		R600_ERR("loop/endloop in shader code are not paired.\n");
		return -EINVAL;
	}

	/* fixup loop pointers - from r600isa
	   LOOP END points to CF after LOOP START,
	   LOOP START point to CF after LOOP END
	   BRK/CONT point to LOOP END CF
	*/
	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;

	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;

	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
	}
	/* XXX add LOOPRET support */
	fc_poplevel(ctx);
	callstack_pop(ctx, FC_LOOP);
	return 0;
}

/* BRK/CONT: find the innermost enclosing FC_LOOP frame (fc_stack slot 0 is
 * unused, so fscp == 0 means "no enclosing loop"), emit the BREAK or
 * CONTINUE CF op selected by inst_info, and queue it for address patching
 * at ENDLOOP. */
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
{
	unsigned int fscp;

	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
			break;
	}

	if (fscp == 0) {
		R600_ERR("Break not inside loop/endloop pair\n");
		return -EINVAL;
	}

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);

	fc_set_mid(ctx, fscp);

	return 0;
}

/* UMAD (dst = src0 * src1 + src2): lower to MULLO_UINT into temp_reg
 * followed by ADD_INT per written channel.  On Cayman MULLO_UINT is
 * emitted in all four vector slots with only slot i writing (see the
 * CAYMAN notes at the top of the file). */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				tgsi_dst(ctx, &inst->Dst[0], j, &alu.dst);
				alu.dst.sel = ctx->temp_reg;
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* + src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Per-opcode dispatch table for R600/R700 (pre-Evergreen).  Entries are
 * indexed by TGSI opcode value; numeric literals fill gaps in the opcode
 * space so indexing stays aligned. */
static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	{TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_r600_arl},
	{TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
	{TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIP_IEEE instead.
	 */
	{TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},

	{TGSI_OPCODE_RSQ, 0, ALU_OP0_NOP, tgsi_rsq},
	{TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
	{TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
	{TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
	{TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
	{TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
	{TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
	{TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
	{TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
	{TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
	{TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
	{TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2},
	{TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
	{TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{20, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{22, 0, ALU_OP0_NOP, tgsi_unsupported},
	{23, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
	{TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
	{TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
	{TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	{TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	{TGSI_OPCODE_POW, 0, ALU_OP0_NOP, tgsi_pow},
	{TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
	/* gap */
	{32, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
	{TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_COS, 0, ALU_OP1_COS, tgsi_trig},
	{TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	{TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	{TGSI_OPCODE_KILP, 0, ALU_OP2_KILLGT, tgsi_kill},  /* predicated kill */
	{TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
	{TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
	{TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, tgsi_trig},
	{TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
	{TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
	{TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
	{TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_r600_arl},
	{TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
	{TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
	{TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
	{TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
	{TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
	{TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	{TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
	{TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
	{76, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
	{TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
	/* gap */
	{79, 0, ALU_OP0_NOP, tgsi_unsupported},
	{80, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
	{TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	{TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
	{TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
	{TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2_trans},
	/* gap */
	{88, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
	{TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
	{TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
	{TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
	{TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
	{TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	{TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	{TGSI_OPCODE_EMIT, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDPRIM, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop},
	{TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop},
	{TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	/* gap */
	{104, 0, ALU_OP0_NOP, tgsi_unsupported},
	{105, 0, ALU_OP0_NOP, tgsi_unsupported},
	{106, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{108, 0, ALU_OP0_NOP, tgsi_unsupported},
	{109, 0, ALU_OP0_NOP, tgsi_unsupported},
	{110, 0, ALU_OP0_NOP, tgsi_unsupported},
	{111, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{114, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_KIL, 0, ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	{TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	/* gap */
	{118, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	{TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv},
	{TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2},
	{TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2},
	{TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg},
	{TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2},
	{TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2_trans},
	{TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
	{TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	{TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	{TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2},
	{TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv},
	{TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad},
	{TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2},
	{TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2},
	{TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod},
	{TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	{TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2},
	{TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2},
	{TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2_trans},
	{TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	{TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2_swap},
	{TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_r600_arl},
	{TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp},
	{TGSI_OPCODE_IABS, 0, 0, tgsi_iabs},
	{TGSI_OPCODE_ISSG, 0, 0, tgsi_issg},
	{TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
	{TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
	{TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported},
};

/* Evergreen (r8xx) dispatch table.  Differs from the r600 table mainly in
 * ARL/ARR/UARL handling (tgsi_eg_arl), RCP/RSQ using the IEEE opcodes,
 * F2I/F2U via tgsi_f2i, and shifts/conversions no longer needing the
 * t-slot-only tgsi_op2_trans path. */
static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	{TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_eg_arl},
	{TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
	{TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},
	{TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	{TGSI_OPCODE_RSQ, 0, ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
	{TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
	{TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
	{TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
	{TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
	{TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
	{TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
	{TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
	{TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
	{TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
	{TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
	{TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2},
	{TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
	{TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{20, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{22, 0, ALU_OP0_NOP, tgsi_unsupported},
	{23, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
	{TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
	{TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
	{TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	{TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	{TGSI_OPCODE_POW, 0, ALU_OP0_NOP, tgsi_pow},
	{TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
	/* gap */
	{32, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
	{TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_COS, 0, ALU_OP1_COS, tgsi_trig},
	{TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	{TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	{TGSI_OPCODE_KILP, 0, ALU_OP2_KILLGT, tgsi_kill},  /* predicated kill */
	{TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
	{TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
	{TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, tgsi_trig},
	{TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
	{TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
	{TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
	{TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_eg_arl},
	{TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
	{TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
	{TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
	{TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
	{TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
	{TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	{TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
	{TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
	{76, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
	{TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
	/* gap */
	{79, 0, ALU_OP0_NOP, tgsi_unsupported},
	{80, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
	{TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	{TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
	{TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
	{TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2},
	/* gap */
	{88, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
	{TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
	{TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
	{TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
	{TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
	{TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	{TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	{TGSI_OPCODE_EMIT, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDPRIM, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop},
	{TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop},
	{TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	/* gap */
	{104, 0, ALU_OP0_NOP, tgsi_unsupported},
	{105, 0, ALU_OP0_NOP, tgsi_unsupported},
	{106, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{108, 0, ALU_OP0_NOP, tgsi_unsupported},
	{109, 0, ALU_OP0_NOP, tgsi_unsupported},
	{110, 0, ALU_OP0_NOP, tgsi_unsupported},
	{111, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{114, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_KIL, 0, ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	{TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	/* gap */
	{118, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_f2i},
	{TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv},
	{TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2},
	{TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2},
	{TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg},
	{TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2},
	{TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2},
	{TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
	{TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	{TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	{TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2},
	{TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv},
	{TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad},
	{TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2},
	{TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2},
	{TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod},
	{TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	{TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2},
	{TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2},
	{TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2},
	{TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	{TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2},
	{TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_eg_arl},
	{TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp},
	{TGSI_OPCODE_IABS, 0, 0, tgsi_iabs},
	{TGSI_OPCODE_ISSG, 0, 0, tgsi_issg},
	{TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
	{TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
	{TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported},
};

/* Cayman (r9xx) dispatch table.  Scalar transcendentals (RCP, RSQ, EX2,
 * LG2, POW, COS, SIN) use the cayman_* emitters, which replicate the
 * former t-slot-only ops across the vector slots (see CAYMAN notes at the
 * top of the file). */
static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
	{TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_eg_arl},
	{TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
	{TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},
	{TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_RSQ, 0, ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
	{TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
	{TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
	{TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
	{TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
	{TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
	{TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
	{TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
	{TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
	{TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
	{TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2},
	{TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
	{TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{20, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{22, 0, ALU_OP0_NOP, tgsi_unsupported},
	{23, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
	{TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
	{TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
	{TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_POW, 0, ALU_OP0_NOP, cayman_pow},
	{TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
	/* gap */
	{32, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
	{TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_COS, 0, ALU_OP1_COS, cayman_trig},
	{TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	{TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	{TGSI_OPCODE_KILP, 0, ALU_OP2_KILLGT, tgsi_kill},  /* predicated kill */
	{TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
	{TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
	{TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, cayman_trig},
	{TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
	{TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
	{TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
	{TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_eg_arl},
	{TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
	{TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
	{TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
	{TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
	{TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
	{TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	{TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
	{TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
	{76, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
	{TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
	/* gap */
	{79, 0, ALU_OP0_NOP, tgsi_unsupported},
	{80, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
	{TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2},
	{TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
	{TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
	{TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2},
	/* gap */
	{88, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
	{TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
	{TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
	{TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
	{TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
	{TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	{TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
6521 {TGSI_OPCODE_EMIT, 0, ALU_OP0_NOP, tgsi_unsupported}, 6522 {TGSI_OPCODE_ENDPRIM, 0, ALU_OP0_NOP, tgsi_unsupported}, 6523 {TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop}, 6524 {TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported}, 6525 {TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop}, 6526 {TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported}, 6527 {TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 6528 /* gap */ 6529 {104, 0, ALU_OP0_NOP, tgsi_unsupported}, 6530 {105, 0, ALU_OP0_NOP, tgsi_unsupported}, 6531 {106, 0, ALU_OP0_NOP, tgsi_unsupported}, 6532 {TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported}, 6533 /* gap */ 6534 {108, 0, ALU_OP0_NOP, tgsi_unsupported}, 6535 {109, 0, ALU_OP0_NOP, tgsi_unsupported}, 6536 {110, 0, ALU_OP0_NOP, tgsi_unsupported}, 6537 {111, 0, ALU_OP0_NOP, tgsi_unsupported}, 6538 {TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported}, 6539 {TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported}, 6540 /* gap */ 6541 {114, 0, ALU_OP0_NOP, tgsi_unsupported}, 6542 {TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported}, 6543 {TGSI_OPCODE_KIL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ 6544 {TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */ 6545 /* gap */ 6546 {118, 0, ALU_OP0_NOP, tgsi_unsupported}, 6547 {TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_op2}, 6548 {TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv}, 6549 {TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2}, 6550 {TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2}, 6551 {TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg}, 6552 {TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2}, 6553 {TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2}, 6554 {TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap}, 6555 {TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_op2}, 6556 {TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2}, 6557 {TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2}, 6558 {TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv}, 6559 {TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, 
tgsi_umad}, 6560 {TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2}, 6561 {TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2}, 6562 {TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod}, 6563 {TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_INT, cayman_mul_int_instr}, 6564 {TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2}, 6565 {TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2}, 6566 {TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2}, 6567 {TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap}, 6568 {TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2}, 6569 {TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported}, 6570 {TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6571 {TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported}, 6572 {TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported}, 6573 {TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported}, 6574 {TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported}, 6575 {TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported}, 6576 {TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported}, 6577 {TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported}, 6578 {TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported}, 6579 {TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported}, 6580 {TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported}, 6581 {TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported}, 6582 {TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported}, 6583 {TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported}, 6584 {TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported}, 6585 {TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_eg_arl}, 6586 {TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp}, 6587 {TGSI_OPCODE_IABS, 0, 0, tgsi_iabs}, 6588 {TGSI_OPCODE_ISSG, 0, 0, tgsi_issg}, 6589 {TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported}, 6590 {TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6591 {TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6592 {TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6593 {TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6594 {TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported}, 6595 
{TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported}, 6596 {TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported}, 6597 {TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported}, 6598 {TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported}, 6599 {TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported}, 6600 {TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported}, 6601 {TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported}, 6602 {TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported}, 6603 {TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported}, 6604 {TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported}, 6605 {TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex}, 6606 {TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex}, 6607 {TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex}, 6608 {TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported}, 6609}; 6610