r600_shader.c revision 461c463bb2cf324f34bf76562f9942ceb1d69dec
1/* 2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 */ 23#include "r600_sq.h" 24#include "r600_llvm.h" 25#include "r600_formats.h" 26#include "r600_opcodes.h" 27#include "r600_shader.h" 28#include "r600d.h" 29 30#include "sb/sb_public.h" 31 32#include "pipe/p_shader_tokens.h" 33#include "tgsi/tgsi_info.h" 34#include "tgsi/tgsi_parse.h" 35#include "tgsi/tgsi_scan.h" 36#include "tgsi/tgsi_dump.h" 37#include "util/u_memory.h" 38#include "util/u_math.h" 39#include <stdio.h> 40#include <errno.h> 41 42/* CAYMAN notes 43Why CAYMAN got loops for lots of instructions is explained here. 44 45-These 8xx t-slot only ops are implemented in all vector slots. 46MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT 47These 8xx t-slot only opcodes become vector ops, with all four 48slots expecting the arguments on sources a and b. 
Result is 49broadcast to all channels. 50MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT 51These 8xx t-slot only opcodes become vector ops in the z, y, and 52x slots. 53EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64 54RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64 55SQRT_IEEE/_64 56SIN/COS 57The w slot may have an independent co-issued operation, or if the 58result is required to be in the w slot, the opcode above may be 59issued in the w slot as well. 60The compiler must issue the source argument to slots z, y, and x 61*/ 62 63static int r600_shader_from_tgsi(struct r600_context *rctx, 64 struct r600_pipe_shader *pipeshader, 65 struct r600_shader_key key); 66 67static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr, 68 int size, unsigned comp_mask) { 69 70 if (!size) 71 return; 72 73 if (ps->num_arrays == ps->max_arrays) { 74 ps->max_arrays += 64; 75 ps->arrays = realloc(ps->arrays, ps->max_arrays * 76 sizeof(struct r600_shader_array)); 77 } 78 79 int n = ps->num_arrays; 80 ++ps->num_arrays; 81 82 ps->arrays[n].comp_mask = comp_mask; 83 ps->arrays[n].gpr_start = start_gpr; 84 ps->arrays[n].gpr_count = size; 85} 86 87static void r600_dump_streamout(struct pipe_stream_output_info *so) 88{ 89 unsigned i; 90 91 fprintf(stderr, "STREAMOUT\n"); 92 for (i = 0; i < so->num_outputs; i++) { 93 unsigned mask = ((1 << so->output[i].num_components) - 1) << 94 so->output[i].start_component; 95 fprintf(stderr, " %i: MEM_STREAM0_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n", 96 i, so->output[i].output_buffer, 97 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, 98 so->output[i].register_index, 99 mask & 1 ? "x" : "", 100 mask & 2 ? "y" : "", 101 mask & 4 ? "z" : "", 102 mask & 8 ? "w" : "", 103 so->output[i].dst_offset < so->output[i].start_component ? 
" (will lower)" : ""); 104 } 105} 106 107static int store_shader(struct pipe_context *ctx, 108 struct r600_pipe_shader *shader) 109{ 110 struct r600_context *rctx = (struct r600_context *)ctx; 111 uint32_t *ptr, i; 112 113 if (shader->bo == NULL) { 114 shader->bo = (struct r600_resource*) 115 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4); 116 if (shader->bo == NULL) { 117 return -ENOMEM; 118 } 119 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE); 120 if (R600_BIG_ENDIAN) { 121 for (i = 0; i < shader->shader.bc.ndw; ++i) { 122 ptr[i] = util_bswap32(shader->shader.bc.bytecode[i]); 123 } 124 } else { 125 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr)); 126 } 127 rctx->b.ws->buffer_unmap(shader->bo->cs_buf); 128 } 129 130 return 0; 131} 132 133int r600_pipe_shader_create(struct pipe_context *ctx, 134 struct r600_pipe_shader *shader, 135 struct r600_shader_key key) 136{ 137 struct r600_context *rctx = (struct r600_context *)ctx; 138 struct r600_pipe_shader_selector *sel = shader->selector; 139 int r; 140 bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens); 141 unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB); 142 unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM); 143 unsigned export_shader = key.vs_as_es; 144 145 shader->shader.bc.isa = rctx->isa; 146 147 if (dump) { 148 fprintf(stderr, "--------------------------------------------------------------\n"); 149 tgsi_dump(sel->tokens, 0); 150 151 if (sel->so.num_outputs) { 152 r600_dump_streamout(&sel->so); 153 } 154 } 155 r = r600_shader_from_tgsi(rctx, shader, key); 156 if (r) { 157 R600_ERR("translation from TGSI failed !\n"); 158 return r; 159 } 160 161 /* disable SB for geom shaders - it can't handle the CF_EMIT instructions */ 162 use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY); 163 164 /* Check if the bytecode has already been built. 
When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			return r;
		}
	}

	/* Either plain disassembly, or processing (optimization and/or
	 * disassembly) through the sb backend. */
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			return r;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			/* optimization is disabled here (last arg 0); sb is
			 * only used for its disassembler */
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				return r;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			return r;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		return r;

	/* Build state.
 */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_GEOMETRY:
		/* GS requires Evergreen+; pre-EG geometry shaders are not
		 * implemented in this driver revision. */
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			assert(!"not suported yet");
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		if (rctx->b.chip_class >= EVERGREEN) {
			/* a VS feeding a GS is compiled as an "export shader" (ES) */
			if (export_shader)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		return -EINVAL;
	}
	return 0;
}

/* Release the GPU buffer, bytecode and command buffer owned by a shader. */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}

/*
 * tgsi -> r600 shader
 */
struct r600_shader_tgsi_instruction;

/* Decoded form of one TGSI source operand. */
struct r600_shader_src {
	unsigned			sel;
	unsigned			swizzle[4];
	unsigned			neg;
	unsigned			abs;
	unsigned			rel;
	unsigned			kc_bank;	/* constant-buffer (kcache) bank */
	uint32_t			value[4];	/* literal payload when sel == V_SQ_ALU_SRC_LITERAL */
};

/* Per-translation state for the TGSI -> r600 bytecode conversion. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;
	unsigned				file_offset[TGSI_FILE_COUNT];	/* GPR base per TGSI register file */
	unsigned				temp_reg;
	struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	boolean					use_llvm;
	/* needed for evergreen interpolation */
	boolean					input_centroid;
	boolean					input_linear;
	boolean					input_perspective;
	int					num_interp_gpr;
	int					face_gpr;
	int					colors_used;
	boolean					clip_vertex_write;
	unsigned				cv_output;
	int					fragcoord_input;
	int					native_integers;
	int					next_ring_offset;	/* next ESGS ring slot, in bytes */
	int					gs_next_vertex;
	struct r600_shader			*gs_for_vs;
};

/* Maps one TGSI opcode to its r600 ALU op and handler. */
struct r600_shader_tgsi_instruction {
	unsigned	tgsi_opcode;
	unsigned	is_op3;
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};

static int emit_gs_ring_writes(struct r600_shader_ctx *ctx);
static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);

/* Reject TGSI instructions this backend cannot translate.
 * Returns 0 if supported, -EINVAL otherwise. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	int j;

	if (i->Instruction.NumDstRegs > 1) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
	if (i->Instruction.Predicate) {
		R600_ERR("predicate unsupported\n");
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			/* 2D sources are only allowed for constants, and for
			 * inputs when compiling a geometry shader. */
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
					break;
				/* fall through: dimensioned inputs outside GS
				 * are unsupported */
			default:
				R600_ERR("unsupported src %d (dimension %d)\n", j,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}

/* Pick the barycentric (i,j) pair index for an input, based on its
 * interpolation mode and which modes the whole shader uses.  The pairs are
 * laid out as: persp, persp-centroid, linear, linear-centroid. */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
		int input)
{
	int ij_index = 0;

	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
		if (ctx->shader->input[input].centroid)
			ij_index++;
	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
		/* if we have perspective add one */
		if (ctx->input_perspective) {
			ij_index++;
			/* if we have perspective centroid */
			if (ctx->input_centroid)
				ij_index++;
		}
		if (ctx->shader->input[input].centroid)
			ij_index++;
	}

	ctx->shader->input[input].ij_index = ij_index;
}

/* Emit the Evergreen INTERP_ZW/INTERP_XY ALU pairs that interpolate one
 * input attribute into its GPR.  Two full ALU groups (8 slots) are emitted;
 * only slots 2..5 actually write the destination.
 * NOTE(review): the exact slot/bank-swizzle pattern matches hardware
 * co-issue requirements — do not reorder. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;	/* close the ALU group */
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Load a flat (non-interpolated) input from the parameter cache. */
static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_INTERP_LOAD_P0;

		alu.dst.sel = ctx->shader->input[input].gpr;
		alu.dst.write = 1;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[0].chan = i;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/*
 * Special export handling in shaders
 *
 * shader export ARRAY_BASE for EXPORT_POS:
 * 60 is position
 * 61 is misc vector
 * 62, 63 are clip distance vectors
 *
 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
 * exclusive from render target index)
 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
 *
 *
 * shader export ARRAY_BASE for EXPORT_PIXEL:
 * 0-7 CB targets
 * 61 computed Z vector
 *
 * The use of the values exported in the computed Z vector are controlled
 * by DB_SHADER_CONTROL:
 * Z_EXPORT_ENABLE - Z as a float in RED
 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
 * DB_SOURCE_FORMAT - export control restrictions
 *
 */


/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
static int r600_spi_sid(struct r600_shader_io * io)
{
	int index, name = io->name;
482 483 /* These params are handled differently, they don't need 484 * semantic indices, so we'll use 0 for them. 485 */ 486 if (name == TGSI_SEMANTIC_POSITION || 487 name == TGSI_SEMANTIC_PSIZE || 488 name == TGSI_SEMANTIC_FACE) 489 index = 0; 490 else { 491 if (name == TGSI_SEMANTIC_GENERIC) { 492 /* For generic params simply use sid from tgsi */ 493 index = io->sid; 494 } else { 495 /* For non-generic params - pack name and sid into 8 bits */ 496 index = 0x80 | (name<<3) | (io->sid); 497 } 498 499 /* Make sure that all really used indices have nonzero value, so 500 * we can just compare it to 0 later instead of comparing the name 501 * with different values to detect special cases. */ 502 index++; 503 } 504 505 return index; 506}; 507 508/* turn input into interpolate on EG */ 509static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index) 510{ 511 int r = 0; 512 513 if (ctx->shader->input[index].spi_sid) { 514 ctx->shader->input[index].lds_pos = ctx->shader->nlds++; 515 if (ctx->shader->input[index].interpolate > 0) { 516 evergreen_interp_assign_ij_index(ctx, index); 517 if (!ctx->use_llvm) 518 r = evergreen_interp_alu(ctx, index); 519 } else { 520 if (!ctx->use_llvm) 521 r = evergreen_interp_flat(ctx, index); 522 } 523 } 524 return r; 525} 526 527static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back) 528{ 529 struct r600_bytecode_alu alu; 530 int i, r; 531 int gpr_front = ctx->shader->input[front].gpr; 532 int gpr_back = ctx->shader->input[back].gpr; 533 534 for (i = 0; i < 4; i++) { 535 memset(&alu, 0, sizeof(alu)); 536 alu.op = ALU_OP3_CNDGT; 537 alu.is_op3 = 1; 538 alu.dst.write = 1; 539 alu.dst.sel = gpr_front; 540 alu.src[0].sel = ctx->face_gpr; 541 alu.src[1].sel = gpr_front; 542 alu.src[2].sel = gpr_back; 543 544 alu.dst.chan = i; 545 alu.src[1].chan = i; 546 alu.src[2].chan = i; 547 alu.last = (i==3); 548 549 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 550 return r; 551 } 552 553 return 0; 554} 555 556static 
int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	/* Process one TGSI declaration token: record inputs/outputs on the
	 * shader, set up interpolation (EG), track arrays and system values. */
	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		i = ctx->shader->ninput;
		assert(i < Elements(ctx->shader->input));
		ctx->shader->ninput += count;
		ctx->shader->input[i].name = d->Semantic.Name;
		ctx->shader->input[i].sid = d->Semantic.Index;
		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
		ctx->shader->input[i].centroid = d->Interp.Centroid;
		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
			/* remember special FS inputs for later use */
			switch (ctx->shader->input[i].name) {
			case TGSI_SEMANTIC_FACE:
				ctx->face_gpr = ctx->shader->input[i].gpr;
				break;
			case TGSI_SEMANTIC_COLOR:
				ctx->colors_used++;
				break;
			case TGSI_SEMANTIC_POSITION:
				ctx->fragcoord_input = i;
				break;
			}
			if (ctx->bc->chip_class >= EVERGREEN) {
				if ((r = evergreen_interp_input(ctx, i)))
					return r;
			}
		} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
			/* FIXME probably skip inputs if they aren't passed in the ring */
			ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
			ctx->next_ring_offset += 16;
		}
		/* a declaration may cover a range; clone the first entry */
		for (j = 1; j < count; ++j) {
			ctx->shader->input[i + j] = ctx->shader->input[i];
			ctx->shader->input[i + j].gpr += j;
		}
		break;
	case TGSI_FILE_OUTPUT:
		i = ctx->shader->noutput++;
		assert(i < Elements(ctx->shader->output));
		ctx->shader->output[i].name = d->Semantic.Name;
		ctx->shader->output[i].sid = d->Semantic.Index;
		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
		if (ctx->type ==
 TGSI_PROCESSOR_VERTEX ||
		    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
			ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_CLIPDIST:
				/* four 2-bit masks packed per clip-dist vector */
				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
				break;
			case TGSI_SEMANTIC_PSIZE:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_point_size = 1;
				break;
			case TGSI_SEMANTIC_CLIPVERTEX:
				ctx->clip_vertex_write = TRUE;
				ctx->cv_output = i;
				break;
			}
		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_COLOR:
				ctx->shader->nr_ps_max_color_exports++;
				break;
			}
		}
		break;
	case TGSI_FILE_TEMPORARY:
		/* only indirectly-addressed temp ranges need array records */
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			if (!ctx->native_integers) {
				/* convert the raw instance id (R0.w) to float
				 * for shaders without native integers */
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		/* other system values fall through to the error below */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}

/* Allocate the next scratch temporary GPR for the current instruction. */
static int r600_get_temp(struct r600_shader_ctx *ctx)
{
	return ctx->temp_reg + ctx->max_driver_temp_used++;
}

/*
 * for evergreen we need to scan the shader to find
 the number of GPRs we need to
 * reserve for interpolation.
 *
 * we need to know if we are going to emit
 * any centroid inputs
 * if perspective and linear are required
*/
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	int i;
	int num_baryc;

	ctx->input_linear = FALSE;
	ctx->input_perspective = FALSE;
	ctx->input_centroid = FALSE;
	ctx->num_interp_gpr = 1;

	/* any centroid inputs */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		/* skip position/face */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
			continue;
		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
			ctx->input_linear = TRUE;
		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
			ctx->input_perspective = TRUE;
		if (ctx->info.input_centroid[i])
			ctx->input_centroid = TRUE;
	}

	num_baryc = 0;
	/* ignoring sample for now */
	if (ctx->input_perspective)
		num_baryc++;
	if (ctx->input_linear)
		num_baryc++;
	if (ctx->input_centroid)
		num_baryc *= 2;

	/* each GPR holds two (i,j) barycentric pairs */
	ctx->num_interp_gpr += (num_baryc + 1) >> 1;

	/* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
	return ctx->num_interp_gpr;
}

/* Decode a TGSI source operand into a struct r600_shader_src
 * (swizzle, modifiers, register select / literal payload). */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* a uniformly-swizzled immediate may map to one of the
		 * hardware inline constants (0, 1, 0.5, ...) */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			/* instance id lives in R0.w (see tgsi_declaration) */
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			/* vertex id lives in R0.x */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		}
	} else {
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
		}
	}
}

/* Fetch a relatively-addressed constant through a vertex fetch from
 * constant buffer cb_idx into dst_reg; the address register (plus an
 * optional constant offset) supplies the index. */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int cb_idx, unsigned int offset, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* dst_reg = AR + offset, then fetch through dst_reg */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.write = 1;
		alu.last = 1;

		if
 ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
	vtx.src_gpr = ar_reg;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
	vtx.endian = r600_endian_swap(32);

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}

/* Fetch one attribute of one input vertex from the ESGS ring into dst_reg.
 * index is the attribute slot, vtx_id selects which incoming vertex. */
static int fetch_gs_input(struct r600_shader_ctx *ctx, unsigned index, unsigned vtx_id, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	int offset_reg = vtx_id / 3;
	int offset_chan = vtx_id % 3;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	if (offset_reg == 0 && offset_chan == 2)
		offset_chan = 3;	/* skip R0.z, use R0.w instead */

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.use_const_fields = 1;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}

/* Replace dimensioned GS input operands with temps filled by ring fetches. */
static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i;
867 868 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 869 struct tgsi_full_src_register *src = &inst->Src[i]; 870 871 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) { 872 int treg = r600_get_temp(ctx); 873 int index = src->Register.Index; 874 int vtx_id = src->Dimension.Index; 875 876 fetch_gs_input(ctx, index, vtx_id, treg); 877 ctx->src[i].sel = treg; 878 } 879 } 880 return 0; 881} 882 883static int tgsi_split_constant(struct r600_shader_ctx *ctx) 884{ 885 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 886 struct r600_bytecode_alu alu; 887 int i, j, k, nconst, r; 888 889 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) { 890 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) { 891 nconst++; 892 } 893 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]); 894 } 895 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) { 896 if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) { 897 continue; 898 } 899 900 if (ctx->src[i].rel) { 901 int treg = r600_get_temp(ctx); 902 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].sel - 512, treg))) 903 return r; 904 905 ctx->src[i].kc_bank = 0; 906 ctx->src[i].sel = treg; 907 ctx->src[i].rel = 0; 908 j--; 909 } else if (j > 0) { 910 int treg = r600_get_temp(ctx); 911 for (k = 0; k < 4; k++) { 912 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 913 alu.op = ALU_OP1_MOV; 914 alu.src[0].sel = ctx->src[i].sel; 915 alu.src[0].chan = k; 916 alu.src[0].rel = ctx->src[i].rel; 917 alu.dst.sel = treg; 918 alu.dst.chan = k; 919 alu.dst.write = 1; 920 if (k == 3) 921 alu.last = 1; 922 r = r600_bytecode_add_alu(ctx->bc, &alu); 923 if (r) 924 return r; 925 } 926 ctx->src[i].sel = treg; 927 ctx->src[i].rel =0; 928 j--; 929 } 930 } 931 return 0; 932} 933 934/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */ 935static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx) 936{ 937 struct 
 tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nliteral, r;

	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			nliteral++;
		}
	}
	/* copy all but the last literal operand into a temp register */
	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].value = ctx->src[i].value[k];
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			j--;
		}
	}
	return 0;
}

/* For every COLOR input, blend in the matching back-face color
 * (two-sided lighting). */
static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
{
	int i, r, count = ctx->shader->ninput;

	for (i = 0; i < count; i++) {
		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
			if (r)
				return r;
		}
	}
	return 0;
}

/* Emit the MEM_STREAM output instructions implementing transform feedback.
 * Returns 0 on success or a negative errno. */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	int i, j, r;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SHADER_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored.
 */
	for (i = 0; i < so->num_outputs; i++) {
		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;

		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			/* shift the used components down to start at X */
			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			so->output[i].start_component = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers.
*/ 1040 for (i = 0; i < so->num_outputs; i++) { 1041 struct r600_bytecode_output output; 1042 1043 memset(&output, 0, sizeof(struct r600_bytecode_output)); 1044 output.gpr = so_gpr[i]; 1045 output.elem_size = so->output[i].num_components; 1046 output.array_base = so->output[i].dst_offset - so->output[i].start_component; 1047 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; 1048 output.burst_count = 1; 1049 /* array_size is an upper limit for the burst_count 1050 * with MEM_STREAM instructions */ 1051 output.array_size = 0xFFF; 1052 output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component; 1053 if (ctx->bc->chip_class >= EVERGREEN) { 1054 switch (so->output[i].output_buffer) { 1055 case 0: 1056 output.op = CF_OP_MEM_STREAM0_BUF0; 1057 break; 1058 case 1: 1059 output.op = CF_OP_MEM_STREAM0_BUF1; 1060 break; 1061 case 2: 1062 output.op = CF_OP_MEM_STREAM0_BUF2; 1063 break; 1064 case 3: 1065 output.op = CF_OP_MEM_STREAM0_BUF3; 1066 break; 1067 } 1068 } else { 1069 switch (so->output[i].output_buffer) { 1070 case 0: 1071 output.op = CF_OP_MEM_STREAM0; 1072 break; 1073 case 1: 1074 output.op = CF_OP_MEM_STREAM1; 1075 break; 1076 case 2: 1077 output.op = CF_OP_MEM_STREAM2; 1078 break; 1079 case 3: 1080 output.op = CF_OP_MEM_STREAM3; 1081 break; 1082 } 1083 } 1084 r = r600_bytecode_add_output(ctx->bc, &output); 1085 if (r) 1086 goto out_err; 1087 } 1088 return 0; 1089out_err: 1090 return r; 1091} 1092 1093static int generate_gs_copy_shader(struct r600_context *rctx, 1094 struct r600_pipe_shader *gs) 1095{ 1096 struct r600_shader_ctx ctx = {}; 1097 struct r600_shader *gs_shader = &gs->shader; 1098 struct r600_pipe_shader *cshader; 1099 int ocnt = gs_shader->noutput; 1100 struct r600_bytecode_alu alu; 1101 struct r600_bytecode_vtx vtx; 1102 struct r600_bytecode_output output; 1103 struct r600_bytecode_cf *cf_jump, *cf_pop, 1104 *last_exp_pos = NULL, *last_exp_param = NULL; 1105 int i, next_pos = 60, next_param = 0; 1106 1107 
cshader = calloc(1, sizeof(struct r600_pipe_shader)); 1108 if (!cshader) 1109 return 0; 1110 1111 memcpy(cshader->shader.output, gs_shader->output, ocnt * 1112 sizeof(struct r600_shader_io)); 1113 1114 cshader->shader.noutput = ocnt; 1115 1116 ctx.shader = &cshader->shader; 1117 ctx.bc = &ctx.shader->bc; 1118 ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX; 1119 1120 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family, 1121 rctx->screen->has_compressed_msaa_texturing); 1122 1123 ctx.bc->isa = rctx->isa; 1124 1125 /* R0.x = R0.x & 0x3fffffff */ 1126 memset(&alu, 0, sizeof(alu)); 1127 alu.op = ALU_OP2_AND_INT; 1128 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 1129 alu.src[1].value = 0x3fffffff; 1130 alu.dst.write = 1; 1131 r600_bytecode_add_alu(ctx.bc, &alu); 1132 1133 /* R0.y = R0.x >> 30 */ 1134 memset(&alu, 0, sizeof(alu)); 1135 alu.op = ALU_OP2_LSHR_INT; 1136 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 1137 alu.src[1].value = 0x1e; 1138 alu.dst.chan = 1; 1139 alu.dst.write = 1; 1140 alu.last = 1; 1141 r600_bytecode_add_alu(ctx.bc, &alu); 1142 1143 /* PRED_SETE_INT __, R0.y, 0 */ 1144 memset(&alu, 0, sizeof(alu)); 1145 alu.op = ALU_OP2_PRED_SETE_INT; 1146 alu.src[0].chan = 1; 1147 alu.src[1].sel = V_SQ_ALU_SRC_0; 1148 alu.execute_mask = 1; 1149 alu.update_pred = 1; 1150 alu.last = 1; 1151 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE); 1152 1153 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP); 1154 cf_jump = ctx.bc->cf_last; 1155 1156 /* fetch vertex data from GSVS ring */ 1157 for (i = 0; i < ocnt; ++i) { 1158 struct r600_shader_io *out = &ctx.shader->output[i]; 1159 out->gpr = i + 1; 1160 out->ring_offset = i * 16; 1161 1162 memset(&vtx, 0, sizeof(vtx)); 1163 vtx.op = FETCH_OP_VFETCH; 1164 vtx.buffer_id = R600_GS_RING_CONST_BUFFER; 1165 vtx.fetch_type = 2; 1166 vtx.offset = out->ring_offset; 1167 vtx.dst_gpr = out->gpr; 1168 vtx.dst_sel_x = 0; 1169 vtx.dst_sel_y = 1; 1170 vtx.dst_sel_z = 2; 1171 vtx.dst_sel_w = 3; 1172 vtx.use_const_fields = 1; 1173 
1174 r600_bytecode_add_vtx(ctx.bc, &vtx); 1175 } 1176 1177 /* XXX handle clipvertex, streamout? */ 1178 1179 /* export vertex data */ 1180 /* XXX factor out common code with r600_shader_from_tgsi ? */ 1181 for (i = 0; i < ocnt; ++i) { 1182 struct r600_shader_io *out = &ctx.shader->output[i]; 1183 1184 if (out->name == TGSI_SEMANTIC_CLIPVERTEX) 1185 continue; 1186 1187 memset(&output, 0, sizeof(output)); 1188 output.gpr = out->gpr; 1189 output.elem_size = 3; 1190 output.swizzle_x = 0; 1191 output.swizzle_y = 1; 1192 output.swizzle_z = 2; 1193 output.swizzle_w = 3; 1194 output.burst_count = 1; 1195 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 1196 output.op = CF_OP_EXPORT; 1197 switch (out->name) { 1198 case TGSI_SEMANTIC_POSITION: 1199 output.array_base = next_pos++; 1200 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 1201 break; 1202 1203 case TGSI_SEMANTIC_PSIZE: 1204 output.array_base = next_pos++; 1205 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 1206 break; 1207 case TGSI_SEMANTIC_CLIPDIST: 1208 /* spi_sid is 0 for clipdistance outputs that were generated 1209 * for clipvertex - we don't need to pass them to PS */ 1210 if (out->spi_sid) { 1211 /* duplicate it as PARAM to pass to the pixel shader */ 1212 output.array_base = next_param++; 1213 r600_bytecode_add_output(ctx.bc, &output); 1214 last_exp_param = ctx.bc->cf_last; 1215 } 1216 output.array_base = next_pos++; 1217 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 1218 break; 1219 case TGSI_SEMANTIC_FOG: 1220 output.swizzle_y = 4; /* 0 */ 1221 output.swizzle_z = 4; /* 0 */ 1222 output.swizzle_w = 5; /* 1 */ 1223 break; 1224 } 1225 r600_bytecode_add_output(ctx.bc, &output); 1226 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM) 1227 last_exp_param = ctx.bc->cf_last; 1228 else 1229 last_exp_pos = ctx.bc->cf_last; 1230 } 1231 1232 if (!last_exp_pos) { 1233 memset(&output, 0, sizeof(output)); 1234 output.gpr = 0; 1235 output.elem_size = 3; 1236 
output.swizzle_x = 7; 1237 output.swizzle_y = 7; 1238 output.swizzle_z = 7; 1239 output.swizzle_w = 7; 1240 output.burst_count = 1; 1241 output.type = 2; 1242 output.op = CF_OP_EXPORT; 1243 output.array_base = next_pos++; 1244 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 1245 r600_bytecode_add_output(ctx.bc, &output); 1246 last_exp_pos = ctx.bc->cf_last; 1247 } 1248 1249 if (!last_exp_param) { 1250 memset(&output, 0, sizeof(output)); 1251 output.gpr = 0; 1252 output.elem_size = 3; 1253 output.swizzle_x = 7; 1254 output.swizzle_y = 7; 1255 output.swizzle_z = 7; 1256 output.swizzle_w = 7; 1257 output.burst_count = 1; 1258 output.type = 2; 1259 output.op = CF_OP_EXPORT; 1260 output.array_base = next_param++; 1261 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 1262 r600_bytecode_add_output(ctx.bc, &output); 1263 last_exp_param = ctx.bc->cf_last; 1264 } 1265 1266 last_exp_pos->op = CF_OP_EXPORT_DONE; 1267 last_exp_param->op = CF_OP_EXPORT_DONE; 1268 1269 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP); 1270 cf_pop = ctx.bc->cf_last; 1271 1272 cf_jump->cf_addr = cf_pop->id + 2; 1273 cf_jump->pop_count = 1; 1274 cf_pop->cf_addr = cf_pop->id + 2; 1275 cf_pop->pop_count = 1; 1276 1277 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); 1278 ctx.bc->cf_last->end_of_program = 1; 1279 1280 gs->gs_copy_shader = cshader; 1281 1282 ctx.bc->nstack = 1; 1283 cshader->shader.ring_item_size = ocnt * 16; 1284 1285 return r600_bytecode_build(ctx.bc); 1286} 1287 1288static int emit_gs_ring_writes(struct r600_shader_ctx *ctx) 1289{ 1290 struct r600_bytecode_output output; 1291 int i, k, ring_offset; 1292 1293 for (i = 0; i < ctx->shader->noutput; i++) { 1294 if (ctx->gs_for_vs) { 1295 /* for ES we need to lookup corresponding ring offset expected by GS 1296 * (map this output to GS input by name and sid) */ 1297 /* FIXME precompute offsets */ 1298 ring_offset = -1; 1299 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) { 1300 struct r600_shader_io *in = &ctx->gs_for_vs->input[k]; 
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* Output not consumed by the GS - nothing to write. */
			if (ring_offset == -1)
				continue;
		} else
			ring_offset = i * 16;

		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		ring_offset += ctx->next_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;
		output.op = CF_OP_MEM_RING;
		output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}
	++ctx->gs_next_vertex;
	return 0;
}

/* Translate a TGSI shader into R600-family bytecode (optionally via the
 * LLVM backend), filling pipeshader->shader. Returns 0 on success or a
 * negative errno. */
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 struct r600_shader_key key)
{
	struct r600_screen *rscreen = rctx->screen;
	struct r600_shader *shader = &pipeshader->shader;
	struct tgsi_token *tokens = pipeshader->selector->tokens;
	struct pipe_stream_output_info so = pipeshader->selector->so;
	struct tgsi_full_immediate *immediate;
	struct tgsi_full_property *property;
	struct r600_shader_ctx ctx;
	struct r600_bytecode_output output[32];
	unsigned output_done, noutput;
	unsigned opcode;
	int i, j, k, r = 0;
	int next_pos_base = 60, next_param_base = 0;
	int max_color_exports = MAX2(key.nr_cbufs, 1);
	/* Declarations used by llvm code */
	bool use_llvm = false;
	bool indirect_gprs;
	bool ring_outputs = false;

#ifdef R600_USE_LLVM
	use_llvm = !(rscreen->b.debug_flags & DBG_NO_LLVM);
#endif
	ctx.bc = &shader->bc;
	ctx.shader = shader;
	ctx.native_integers = true;

	shader->vs_as_es = key.vs_as_es;

	r600_bytecode_init(ctx.bc, rscreen->b.chip_class,
			   rscreen->b.family,
			   rscreen->has_compressed_msaa_texturing);
	ctx.tokens = tokens;
	tgsi_scan_shader(tokens, &ctx.info);
	shader->indirect_files = ctx.info.indirect_files;
	indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
	tgsi_parse_init(&ctx.parse, tokens);
	ctx.type = ctx.parse.FullHeader.Processor.Processor;
	shader->processor_type = ctx.type;
	ctx.bc->type = shader->processor_type;

	/* ES (VS feeding a GS) and GS itself write through ring buffers
	 * instead of regular exports. */
	ring_outputs = key.vs_as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY);

	if (key.vs_as_es) {
		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
	} else {
		ctx.gs_for_vs = NULL;
	}

	ctx.next_ring_offset = 0;
	ctx.gs_next_vertex = 0;

	ctx.face_gpr = -1;
	ctx.fragcoord_input = -1;
	ctx.colors_used = 0;
	ctx.clip_vertex_write = 0;

	shader->nr_ps_color_exports = 0;
	shader->nr_ps_max_color_exports = 0;

	shader->two_side = key.color_two_side;

	/* register allocations */
	/* Values [0,127] correspond to GPR[0..127].
	 * Values [128,159] correspond to constant buffer bank 0
	 * Values [160,191] correspond to constant buffer bank 1
	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
	 * Values [256,287] correspond to constant buffer bank 2 (EG)
	 * Values [288,319] correspond to constant buffer bank 3 (EG)
	 * Other special values are shown in the list below.
	 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
	 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
	 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
	 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
	 * 248 SQ_ALU_SRC_0: special constant 0.0.
	 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
	 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
	 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
	 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
	 * 253 SQ_ALU_SRC_LITERAL: literal constant.
	 * 254 SQ_ALU_SRC_PV: previous vector result.
	 * 255 SQ_ALU_SRC_PS: previous scalar result.
	 */
	for (i = 0; i < TGSI_FILE_COUNT; i++) {
		ctx.file_offset[i] = 0;
	}

#ifdef R600_USE_LLVM
	if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
		fprintf(stderr, "Warning: R600 LLVM backend does not support "
			"indirect adressing. Falling back to TGSI "
			"backend.\n");
		use_llvm = 0;
	}
#endif
	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
		/* GPR0 holds the vertex/instance ids; inputs start at GPR1. */
		ctx.file_offset[TGSI_FILE_INPUT] = 1;
		if (!use_llvm) {
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
		}
	}
	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
	}
	if (ctx.type == TGSI_PROCESSOR_GEOMETRY && ctx.bc->chip_class >= EVERGREEN) {
		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
		ctx.file_offset[TGSI_FILE_INPUT] = 2;
	}
	ctx.use_llvm = use_llvm;

	if (use_llvm) {
		ctx.file_offset[TGSI_FILE_OUTPUT] =
			ctx.file_offset[TGSI_FILE_INPUT];
	} else {
		ctx.file_offset[TGSI_FILE_OUTPUT] =
			ctx.file_offset[TGSI_FILE_INPUT] +
			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
	}
	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
					       ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;

	/* Outside the GPR range. This will be translated to one of the
	 * kcache banks later. */
	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;

	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
			 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
	ctx.temp_reg = ctx.bc->ar_reg + 1;

	if (indirect_gprs) {
		shader->max_arrays = 0;
		shader->num_arrays = 0;

		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
					   ctx.file_offset[TGSI_FILE_OUTPUT] -
					   ctx.file_offset[TGSI_FILE_INPUT],
					   0x0F);
		}
		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
					   ctx.file_offset[TGSI_FILE_TEMPORARY] -
					   ctx.file_offset[TGSI_FILE_OUTPUT],
					   0x0F);
		}
	}

	/* First pass over the token stream: collect immediates, process
	 * declarations and properties; instructions are handled later. */
	ctx.nliterals = 0;
	ctx.literals = NULL;
	shader->fs_write_all = FALSE;
	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
		tgsi_parse_token(&ctx.parse);
		switch (ctx.parse.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
			immediate = &ctx.parse.FullToken.FullImmediate;
			/* 16 bytes = one uint32 vec4 per immediate. */
			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
			if(ctx.literals == NULL) {
				r = -ENOMEM;
				goto out_err;
			}
			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
			ctx.nliterals++;
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
			r = tgsi_declaration(&ctx);
			if (r)
				goto out_err;
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			break;
		case TGSI_TOKEN_TYPE_PROPERTY:
			property = &ctx.parse.FullToken.FullProperty;
			switch (property->Property.PropertyName) {
			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
				if (property->u[0].Data == 1)
					shader->fs_write_all = TRUE;
				break;
			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
				/* we don't need this one */
				break;
			case TGSI_PROPERTY_GS_INPUT_PRIM:
				shader->gs_input_prim = property->u[0].Data;
				break;
			case TGSI_PROPERTY_GS_OUTPUT_PRIM:
				shader->gs_output_prim = property->u[0].Data;
				break;
			case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES:
				shader->gs_max_out_vertices = property->u[0].Data;
				break;
			}
			break;
		default:
			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
			r = -EINVAL;
			goto out_err;
		}
	}

	shader->ring_item_size = ctx.next_ring_offset;

	/* Process two side if needed */
	if (shader->two_side && ctx.colors_used) {
		int i, count = ctx.shader->ninput;
		unsigned next_lds_loc = ctx.shader->nlds;

		/* additional inputs will be allocated right after the existing inputs,
		 * we won't need them after the color selection, so we don't need to
		 * reserve these gprs for the rest of the shader code and to adjust
		 * output offsets etc. */
		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
			  ctx.info.file_max[TGSI_FILE_INPUT] + 1;

		/* Front-face selection needs the FACE input; synthesize one
		 * if the shader did not declare it. */
		if (ctx.face_gpr == -1) {
			i = ctx.shader->ninput++;
			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
			ctx.shader->input[i].spi_sid = 0;
			ctx.shader->input[i].gpr = gpr++;
			ctx.face_gpr = ctx.shader->input[i].gpr;
		}

		for (i = 0; i < count; i++) {
			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
				int ni = ctx.shader->ninput++;
				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
				ctx.shader->input[ni].gpr = gpr++;
				// TGSI to LLVM needs to know the lds position of inputs.
				// Non LLVM path computes it later (in process_twoside_color)
				ctx.shader->input[ni].lds_pos = next_lds_loc++;
				ctx.shader->input[i].back_color_input = ni;
				if (ctx.bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(&ctx, ni)))
						return r;
				}
			}
		}
	}

/* LLVM backend setup */
#ifdef R600_USE_LLVM
	if (use_llvm) {
		struct radeon_llvm_context radeon_llvm_ctx;
		LLVMModuleRef mod;
		bool dump = r600_can_dump_shader(&rscreen->b, tokens);
		boolean use_kill = false;

		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
		radeon_llvm_ctx.type = ctx.type;
		radeon_llvm_ctx.two_side = shader->two_side;
		radeon_llvm_ctx.face_gpr = ctx.face_gpr;
		radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
		radeon_llvm_ctx.r600_outputs = ctx.shader->output;
		radeon_llvm_ctx.color_buffer_count = max_color_exports;
		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
		radeon_llvm_ctx.stream_outputs = &so;
		radeon_llvm_ctx.clip_vertex = ctx.cv_output;
		radeon_llvm_ctx.alpha_to_one = key.alpha_to_one;
		radeon_llvm_ctx.has_compressed_msaa_texturing =
			ctx.bc->has_compressed_msaa_texturing;
		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
		ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
		ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;

		/* On LLVM failure fall back to the TGSI path below. */
		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
			radeon_llvm_dispose(&radeon_llvm_ctx);
			use_llvm = 0;
			fprintf(stderr, "R600 LLVM backend failed to compile "
				"shader. Falling back to TGSI\n");
		} else {
			ctx.file_offset[TGSI_FILE_OUTPUT] =
				ctx.file_offset[TGSI_FILE_INPUT];
		}
		if (use_kill)
			ctx.shader->uses_kill = use_kill;
		radeon_llvm_dispose(&radeon_llvm_ctx);
	}
#endif
/* End of LLVM backend setup */

	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
		shader->nr_ps_max_color_exports = 8;

	if (!use_llvm) {
		/* Precompute 1/W for fragcoord if requested: on Cayman the
		 * reciprocal is a vector op, otherwise a single t-slot op. */
		if (ctx.fragcoord_input >= 0) {
			if (ctx.bc->chip_class == CAYMAN) {
				for (j = 0 ; j < 4; j++) {
					struct r600_bytecode_alu alu;
					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
					alu.op = ALU_OP1_RECIP_IEEE;
					alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
					alu.src[0].chan = 3;

					alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
					alu.dst.chan = j;
					alu.dst.write = (j == 3);
					alu.last = 1;
					if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
						return r;
				}
			} else {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
				alu.src[0].chan = 3;

				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
					return r;
			}
		}

		if (shader->two_side && ctx.colors_used) {
			if ((r = process_twoside_color_inputs(&ctx)))
				return r;
		}

		/* Second pass: translate the instructions. */
		tgsi_parse_init(&ctx.parse, tokens);
		while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
			tgsi_parse_token(&ctx.parse);
			switch (ctx.parse.FullToken.Token.Type) {
			case TGSI_TOKEN_TYPE_INSTRUCTION:
				r = tgsi_is_supported(&ctx);
				if (r)
					goto out_err;
				ctx.max_driver_temp_used = 0;
				/* reserve first tmp for everyone */
				r600_get_temp(&ctx);

				opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
				if ((r
= tgsi_split_constant(&ctx)))
					goto out_err;
				if ((r = tgsi_split_literal_constant(&ctx)))
					goto out_err;
				if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
					if ((r = tgsi_split_gs_inputs(&ctx)))
						goto out_err;
				/* Dispatch through the per-chip opcode table. */
				if (ctx.bc->chip_class == CAYMAN)
					ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
				else if (ctx.bc->chip_class >= EVERGREEN)
					ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
				else
					ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
				r = ctx.inst_info->process(&ctx);
				if (r)
					goto out_err;
				break;
			default:
				break;
			}
		}
	}

	/* Reset the temporary register counter. */
	ctx.max_driver_temp_used = 0;

	noutput = shader->noutput;

	if (!ring_outputs && ctx.clip_vertex_write) {
		unsigned clipdist_temp[2];

		clipdist_temp[0] = r600_get_temp(&ctx);
		clipdist_temp[1] = r600_get_temp(&ctx);

		/* need to convert a clipvertex write into clipdistance writes and not export
		   the clip vertex anymore */

		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
		shader->output[noutput].gpr = clipdist_temp[0];
		noutput++;
		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
		shader->output[noutput].gpr = clipdist_temp[1];
		noutput++;

		/* reset spi_sid for clipvertex output to avoid confusing spi */
		shader->output[ctx.cv_output].spi_sid = 0;

		shader->clip_dist_write = 0xFF;

		/* Eight clip distances: DOT4(clipvertex, user clip plane i),
		 * packed four per output register. */
		for (i = 0; i < 8; i++) {
			int oreg = i >> 2;
			int ochan = i & 3;

			for (j = 0; j < 4; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_DOT4;
				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
				alu.src[0].chan = j;

				alu.src[1].sel = 512 + i;
				alu.src[1].kc_bank = R600_UCP_CONST_BUFFER;
				alu.src[1].chan = j;

				alu.dst.sel = clipdist_temp[oreg];
				alu.dst.chan = j;
				alu.dst.write = (j == ochan);
				if (j == 3)
					alu.last = 1;
				if (!use_llvm)
					r = r600_bytecode_add_alu(ctx.bc, &alu);
				if (r)
					return r;
			}
		}
	}

	/* Add stream outputs. */
	if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
	    so.num_outputs && !use_llvm)
		emit_streamout(&ctx, &so);

	if (ring_outputs) {
		if (key.vs_as_es)
			emit_gs_ring_writes(&ctx);
	} else {
		/* export output */
		for (i = 0, j = 0; i < noutput; i++, j++) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = shader->output[i].gpr;
			output[j].elem_size = 3;
			output[j].swizzle_x = 0;
			output[j].swizzle_y = 1;
			output[j].swizzle_z = 2;
			output[j].swizzle_w = 3;
			output[j].burst_count = 1;
			/* -1 = sentinel: defaults to PARAM export below. */
			output[j].type = -1;
			output[j].op = CF_OP_EXPORT;
			switch (ctx.type) {
			case TGSI_PROCESSOR_VERTEX:
				switch (shader->output[i].name) {
				case TGSI_SEMANTIC_POSITION:
					output[j].array_base = next_pos_base++;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					break;

				case TGSI_SEMANTIC_PSIZE:
					output[j].array_base = next_pos_base++;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					j--;
					break;
				case TGSI_SEMANTIC_CLIPDIST:
					output[j].array_base = next_pos_base++;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					/* spi_sid is 0 for clipdistance outputs that were generated
					 * for clipvertex - we don't need to pass them to PS */
					if (shader->output[i].spi_sid) {
						j++;
						/* duplicate it as PARAM to pass to the pixel shader */
						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
						output[j].array_base = next_param_base++;
						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
					}
					break;
				case TGSI_SEMANTIC_FOG:
					output[j].swizzle_y = 4; /* 0 */
					output[j].swizzle_z = 4; /* 0 */
					output[j].swizzle_w = 5; /* 1 */
					break;
				}
				break;
			case TGSI_PROCESSOR_FRAGMENT:
				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
					/* never export more colors than the number of CBs */
					if (shader->output[i].sid >= max_color_exports) {
						/* skip export */
						j--;
						continue;
					}
					output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
					output[j].array_base = shader->output[i].sid;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
					shader->nr_ps_color_exports++;
					/* fs_write_all: replicate color 0 to every CB. */
					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
						for (k = 1; k < max_color_exports; k++) {
							j++;
							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
							output[j].gpr = shader->output[i].gpr;
							output[j].elem_size = 3;
							output[j].swizzle_x = 0;
							output[j].swizzle_y = 1;
							output[j].swizzle_z = 2;
							output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
							output[j].burst_count = 1;
							output[j].array_base = k;
							output[j].op = CF_OP_EXPORT;
							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
							shader->nr_ps_color_exports++;
						}
					}
				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
					/* Depth export: array_base 61, Z in X slot. */
					output[j].array_base = 61;
					output[j].swizzle_x = 2;
					output[j].swizzle_y = 7;
					output[j].swizzle_z = output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
					output[j].array_base = 61;
					output[j].swizzle_x = 7;
					output[j].swizzle_y = 1;
					output[j].swizzle_z = output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
				} else {
					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
					r = -EINVAL;
					goto out_err;
				}
				break;
			default:
				R600_ERR("unsupported processor type %d\n", ctx.type);
				r = -EINVAL;
				goto out_err;
			}

			if (output[j].type==-1) {
				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
				output[j].array_base = next_param_base++;
			}
		}

		/* add fake position export */
		if (ctx.type == TGSI_PROCESSOR_VERTEX && next_pos_base == 60) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = 0;
			output[j].elem_size = 3;
			output[j].swizzle_x = 7;
			output[j].swizzle_y = 7;
			output[j].swizzle_z = 7;
			output[j].swizzle_w = 7;
			output[j].burst_count = 1;
			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output[j].array_base = next_pos_base;
			output[j].op = CF_OP_EXPORT;
			j++;
		}

		/* add fake param output for vertex shader if no param is exported */
		if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = 0;
			output[j].elem_size = 3;
			output[j].swizzle_x = 7;
			output[j].swizzle_y = 7;
			output[j].swizzle_z = 7;
			output[j].swizzle_w = 7;
			output[j].burst_count = 1;
			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
			output[j].array_base = 0;
			output[j].op = CF_OP_EXPORT;
			j++;
		}

		/* add fake pixel export */
		if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = 0;
			output[j].elem_size = 3;
			output[j].swizzle_x = 7;
			output[j].swizzle_y = 7;
			output[j].swizzle_z = 7;
			output[j].swizzle_w = 7;
			output[j].burst_count = 1;
			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
			output[j].array_base = 0;
			output[j].op = CF_OP_EXPORT;
			j++;
		}

		noutput = j;

		/* set export done on last export of each type */
		for (i = noutput - 1, output_done = 0; i >= 0; i--) {
			if (!(output_done & (1 << output[i].type))) {
				output_done |= (1 << output[i].type);
				output[i].op = CF_OP_EXPORT_DONE;
			}
		}
		/* add output to bytecode */
		if (!use_llvm) {
			for (i = 0; i < noutput; i++) {
				r = r600_bytecode_add_output(ctx.bc, &output[i]);
				if (r)
					goto out_err;
			}
		}
	}

	/* add program end */
	if (!use_llvm) {
		if (ctx.bc->chip_class == CAYMAN)
			cm_bytecode_add_cf_end(ctx.bc);
		else {
			const struct cf_op_info *last = NULL;

			if (ctx.bc->cf_last)
				last = r600_isa_cf(ctx.bc->cf_last->op);

			/* alu clause instructions don't have EOP bit, so add NOP */
			if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS)
				r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);

			ctx.bc->cf_last->end_of_program = 1;
		}
	}

	/* check GPR limit - we have 124 = 128 - 4
	 * (4 are reserved as alu clause temporary registers) */
	if (ctx.bc->ngpr > 124) {
		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
		r = -ENOMEM;
		goto out_err;
	}

	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
		if ((r = generate_gs_copy_shader(rctx, pipeshader)))
			return r;
	}

	free(ctx.literals);
	tgsi_parse_free(&ctx.parse);
	return 0;
out_err:
	free(ctx.literals);
	tgsi_parse_free(&ctx.parse);
	return r;
}

/* Default handler for TGSI opcodes not implemented on this chip family. */
static int tgsi_unsupported(struct r600_shader_ctx *ctx)
{
	R600_ERR("%s tgsi opcode unsupported\n",
		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
	return -EINVAL;
}

/* TGSI END needs no bytecode; program termination is emitted separately. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	return 0;
}

/* Fill an ALU source operand from a parsed shader source for the given
 * destination channel (applies the source swizzle). */
static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			      const struct r600_shader_src *shader_src,
			      unsigned chan)
{
	bc_src->sel = shader_src->sel;
	bc_src->chan = shader_src->swizzle[chan];
	bc_src->neg = shader_src->neg;
	bc_src->abs = shader_src->abs;
	bc_src->rel = shader_src->rel;
	bc_src->value = shader_src->value[bc_src->chan];
	bc_src->kc_bank = shader_src->kc_bank;
}

/* |src|: absolute value also cancels any pending negate modifier. */
static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}

static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->neg = !bc_src->neg;
}

/* Fill an ALU destination operand from a TGSI destination register for
 * the given channel; honors indirect addressing and saturate. */
static void tgsi_dst(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_dst_register *tgsi_dst,
		     unsigned swizzle,
		     struct r600_bytecode_alu_dst *r600_dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	r600_dst->sel = tgsi_dst->Register.Index;
	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
	r600_dst->chan = swizzle;
	r600_dst->write = 1;
	if (tgsi_dst->Register.Indirect)
		r600_dst->rel = V_SQ_REL_RELATIVE;
	if (inst->Instruction.Saturate) {
		r600_dst->clamp = 1;
	}
}

/* Return the index of the highest channel set in 'writemask'
 * (0 if the mask is empty). */
static int tgsi_last_instruction(unsigned writemask)
{
	int i, lasti = 0;

	for (i = 0; i < 4; i++) {
		if (writemask & (1 << i)) {
			lasti = i;
		}
	}
	return lasti;
}

/* Generic per-channel two-operand op.
 * swap:       emit src1 op src0 instead of src0 op src1.
 * trans_only: the op only runs in the transcendental slot, so every
 *             instruction must close its ALU group (alu.last), and when
 *             more than one channel is written the results go through a
 *             temp register first to avoid clobbering a source. */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* handle some special cases */
		switch (ctx->inst_info->tgsi_opcode) {
		case TGSI_OPCODE_SUB:
			/* SUB is ADD with src1 negated */
			r600_bytecode_src_toggle_neg(&alu.src[1]);
			break;
		case TGSI_OPCODE_ABS:
			r600_bytecode_src_set_abs(&alu.src[0]);
			break;
		default:
			break;
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

/* Plain two-operand op, sources in TGSI order. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}

/* Two-operand op with sources swapped (for hw ops with reversed
 * operand order relative to TGSI). */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}

/* Two-operand op restricted to the transcendental slot. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}

/* Integer negate, emitted as (0 - src) per written channel. */
static int tgsi_ineg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;

}

/* CAYMAN: scalar float op replicated across the vector slots (see the
 * CAYMAN notes at the top of the file); only the write-masked slot
 * actually stores its result. */
static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ?
	4 : 3;	/* only use slot w when the write mask needs it */

	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);

			/* RSQ should take the absolute value of src */
			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
				r600_bytecode_src_set_abs(&alu.src[j]);
			}
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* CAYMAN: integer multiply; for each written channel k the op is
 * replicated over all four slots, with only slot k storing. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	for (k = 0; k < last_slot; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

/*
 * Range reduction for SIN/COS into temp_reg.x:
 * r600 - trunc to -PI..PI range
 * r700 - normalize by dividing by 2PI
 * see fdo bug 27901
 */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
	static float double_pi = 3.1415926535 * 2;
	static float neg_pi = -3.1415926535;

	int r;
	struct r600_bytecode_alu alu;

	/* tmp.x = src * (1/2pi) + 0.5 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = *(uint32_t *)&half_inv_pi;
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* tmp.x = fract(tmp.x), reduces to one period */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* r600: tmp.x = tmp.x * 2pi - pi  (range -PI..PI)
	 * r700+: tmp.x = tmp.x * 1 - 0.5  (normalized range) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		alu.src[1].value = *(uint32_t *)&double_pi;
		alu.src[2].value = *(uint32_t *)&neg_pi;
	} else {
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* CAYMAN: SIN/COS - range-reduce, then replicate the t-slot-only trig
 * op across the vector slots. */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ?
	4 : 3;	/* only use slot w when the write mask needs it */
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* SIN/COS (non-CAYMAN): range-reduce, run the trig op once into
 * temp_reg.x, then replicate to every written channel. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* SCS: dst = (cos(src.x), sin(src.x), 0.0, 1.0), honoring the write
 * mask per channel. */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			/* t-slot-only op: replicate over three slots, store x */
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask
	    & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* KILL / KILL_IF: emit the kill op on all four channels (KILL uses a
 * constant -1.0 condition, KILL_IF the instruction's source). */
static int tgsi_kill(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILL) {
			/* unconditional kill: compare 0 against -1.0 */
			alu.src[1].sel = V_SQ_ALU_SRC_1;
			alu.src[1].neg = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* kill must be last in ALU */
	ctx->bc->force_add_cf = 1;
	ctx->shader->uses_kill = TRUE;
	return 0;
}

/* LIT: dst = (1.0, max(src.x,0), src.x>0 ? pow(max(src.y,0),src.w) : 0, 1.0)
 * The z computation uses LOG_CLAMPED + MUL_LIT + EXP_IEEE. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* reuse whichever channel/register the log landed in */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}

/* RSQ: reciprocal square root of |src.x| into temp_reg.x, replicated
 * to all written channels. */
static int tgsi_rsq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIPSQRT_IEEE instead.
	 */
	alu.op = ALU_OP1_RECIPSQRT_CLAMPED;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
		r600_bytecode_src_set_abs(&alu.src[i]);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}

/* Copy temp_reg.x to every channel of the destination that the write
 * mask enables (common tail for scalar-result ops). */
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.src[0].sel = ctx->temp_reg;
		alu.op = ALU_OP1_MOV;
		alu.dst.chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Transcendental op on src.x into temp_reg.x, then replicate to the
 * written channels. */
static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}

/* CAYMAN: POW(a,b) = EXP2(b * LOG2(a)); t-slot-only LOG/EXP ops are
 * replicated across the vector slots. */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* LOG2(a), replicated over three slots */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* POW(a,b) = EXP2(b * LOG2(a)) via temp_reg.x, then replicate. */
static int tgsi_pow(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_LOG_IEEE;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* POW(a,b) = EXP2(b * LOG2(a))*/
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_EXP_IEEE;
	alu.src[0].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return tgsi_helper_tempx_replicate(ctx);
}

/* Integer division/modulo (hardware has no int div): Newton-Raphson
 * style refinement of RECIP_UINT, per written channel.
 * mod != 0 computes the remainder, signed_op != 0 runs on absolute
 * values and fixes the result sign afterwards. */
static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int tmp0 = ctx->temp_reg;
	int tmp1 = r600_get_temp(ctx);
	int tmp2 = r600_get_temp(ctx);
	int tmp3 = r600_get_temp(ctx);
	/* Unsigned path:
	 *
	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
	 *
	 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
	 * 2. tmp0.z = lo (tmp0.x * src2)
	 * 3. tmp0.w = -tmp0.z
	 * 4. tmp0.y = hi (tmp0.x * src2)
	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
	 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
	 * 7. tmp1.x = tmp0.x - tmp0.w
	 * 8. tmp1.y = tmp0.x + tmp0.w
	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
	 * 10. tmp0.z = hi(tmp0.x * src1) = q
	 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
	 *
	 * 12. tmp0.w = src1 - tmp0.y = r
	 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
	 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
	 *
	 * if DIV
	 *
	 * 15. tmp1.z = tmp0.z + 1 = q + 1
	 * 16. tmp1.w = tmp0.z - 1 = q - 1
	 *
	 * else MOD
	 *
	 * 15. tmp1.z = tmp0.w - src2 = r - src2
	 * 16. tmp1.w = tmp0.w + src2 = r + src2
	 *
	 * endif
	 *
	 * 17. tmp1.x = tmp1.x & tmp1.y
	 *
	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
	 *
	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
	 *
	 * Signed path:
	 *
	 * Same as unsigned, using abs values of the operands,
	 * and fixing the sign of the result in the end.
	 */

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		if (signed_op) {

			/* tmp2.x = -src0 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = -src1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.z sign bit is set if src0 and src2 signs are different */
			/* it will be a sign of the quotient */
			if (!mod) {

				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_XOR_INT;

				alu.dst.sel = tmp2;
				alu.dst.chan = 2;
				alu.dst.write = 1;

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp2.x = |src0| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = |src1| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 1;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			/* no RECIP_UINT on cayman: go through float
			 * (u2f, RECIP_IEEE, scale by 2^32, f2u) */
			/* tmp3.x = u2f(src2) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_UINT_TO_FLT;

			alu.dst.sel = tmp3;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp0.x = recip(tmp3.x) */
			for (j = 0 ; j < 3; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 0);

				alu.src[0].sel = tmp3;
				alu.src[0].chan = 0;

				if (j == 2)
					alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp3.x = tmp0.x * 2^32 (0x4f800000 = 2^32 as float) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MUL;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0x4f800000;

			alu.dst.sel = tmp3;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			/* tmp0.x = f2u(tmp3.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_FLT_TO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = tmp3;
			alu.src[0].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 2. tmp0.z = lo (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;
				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 3. tmp0.w = -tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 4. tmp0.y = hi (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}
				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 3);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 2;

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 0;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 7. tmp1.x = tmp0.x - tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 8. tmp1.y = tmp0.x + tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 0;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 10. tmp0.z = hi(tmp0.x * src1) = q */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 0;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 0;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				if (signed_op) {
					alu.src[0].sel = tmp2;
					alu.src[0].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
				}

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 2;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 12. tmp0.w = src1 - tmp0.y = r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 13.
tmp1.x = tmp0.w >= src2 = r >= src2 */ 3404 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3405 alu.op = ALU_OP2_SETGE_UINT; 3406 3407 alu.dst.sel = tmp1; 3408 alu.dst.chan = 0; 3409 alu.dst.write = 1; 3410 3411 alu.src[0].sel = tmp0; 3412 alu.src[0].chan = 3; 3413 if (signed_op) { 3414 alu.src[1].sel = tmp2; 3415 alu.src[1].chan = 1; 3416 } else { 3417 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 3418 } 3419 3420 alu.last = 1; 3421 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3422 return r; 3423 3424 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */ 3425 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3426 alu.op = ALU_OP2_SETGE_UINT; 3427 3428 alu.dst.sel = tmp1; 3429 alu.dst.chan = 1; 3430 alu.dst.write = 1; 3431 3432 if (signed_op) { 3433 alu.src[0].sel = tmp2; 3434 alu.src[0].chan = 0; 3435 } else { 3436 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3437 } 3438 3439 alu.src[1].sel = tmp0; 3440 alu.src[1].chan = 1; 3441 3442 alu.last = 1; 3443 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3444 return r; 3445 3446 if (mod) { /* UMOD */ 3447 3448 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */ 3449 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3450 alu.op = ALU_OP2_SUB_INT; 3451 3452 alu.dst.sel = tmp1; 3453 alu.dst.chan = 2; 3454 alu.dst.write = 1; 3455 3456 alu.src[0].sel = tmp0; 3457 alu.src[0].chan = 3; 3458 3459 if (signed_op) { 3460 alu.src[1].sel = tmp2; 3461 alu.src[1].chan = 1; 3462 } else { 3463 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 3464 } 3465 3466 alu.last = 1; 3467 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3468 return r; 3469 3470 /* 16. 
tmp1.w = tmp0.w + src2 = r + src2 */ 3471 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3472 alu.op = ALU_OP2_ADD_INT; 3473 3474 alu.dst.sel = tmp1; 3475 alu.dst.chan = 3; 3476 alu.dst.write = 1; 3477 3478 alu.src[0].sel = tmp0; 3479 alu.src[0].chan = 3; 3480 if (signed_op) { 3481 alu.src[1].sel = tmp2; 3482 alu.src[1].chan = 1; 3483 } else { 3484 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 3485 } 3486 3487 alu.last = 1; 3488 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3489 return r; 3490 3491 } else { /* UDIV */ 3492 3493 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */ 3494 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3495 alu.op = ALU_OP2_ADD_INT; 3496 3497 alu.dst.sel = tmp1; 3498 alu.dst.chan = 2; 3499 alu.dst.write = 1; 3500 3501 alu.src[0].sel = tmp0; 3502 alu.src[0].chan = 2; 3503 alu.src[1].sel = V_SQ_ALU_SRC_1_INT; 3504 3505 alu.last = 1; 3506 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3507 return r; 3508 3509 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */ 3510 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3511 alu.op = ALU_OP2_ADD_INT; 3512 3513 alu.dst.sel = tmp1; 3514 alu.dst.chan = 3; 3515 alu.dst.write = 1; 3516 3517 alu.src[0].sel = tmp0; 3518 alu.src[0].chan = 2; 3519 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT; 3520 3521 alu.last = 1; 3522 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3523 return r; 3524 3525 } 3526 3527 /* 17. tmp1.x = tmp1.x & tmp1.y */ 3528 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3529 alu.op = ALU_OP2_AND_INT; 3530 3531 alu.dst.sel = tmp1; 3532 alu.dst.chan = 0; 3533 alu.dst.write = 1; 3534 3535 alu.src[0].sel = tmp1; 3536 alu.src[0].chan = 0; 3537 alu.src[1].sel = tmp1; 3538 alu.src[1].chan = 1; 3539 3540 alu.last = 1; 3541 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3542 return r; 3543 3544 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */ 3545 /* 18. tmp0.z = tmp1.x==0 ? 
tmp0.w : tmp1.z MOD */ 3546 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3547 alu.op = ALU_OP3_CNDE_INT; 3548 alu.is_op3 = 1; 3549 3550 alu.dst.sel = tmp0; 3551 alu.dst.chan = 2; 3552 alu.dst.write = 1; 3553 3554 alu.src[0].sel = tmp1; 3555 alu.src[0].chan = 0; 3556 alu.src[1].sel = tmp0; 3557 alu.src[1].chan = mod ? 3 : 2; 3558 alu.src[2].sel = tmp1; 3559 alu.src[2].chan = 2; 3560 3561 alu.last = 1; 3562 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3563 return r; 3564 3565 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */ 3566 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3567 alu.op = ALU_OP3_CNDE_INT; 3568 alu.is_op3 = 1; 3569 3570 if (signed_op) { 3571 alu.dst.sel = tmp0; 3572 alu.dst.chan = 2; 3573 alu.dst.write = 1; 3574 } else { 3575 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3576 } 3577 3578 alu.src[0].sel = tmp1; 3579 alu.src[0].chan = 1; 3580 alu.src[1].sel = tmp1; 3581 alu.src[1].chan = 3; 3582 alu.src[2].sel = tmp0; 3583 alu.src[2].chan = 2; 3584 3585 alu.last = 1; 3586 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3587 return r; 3588 3589 if (signed_op) { 3590 3591 /* fix the sign of the result */ 3592 3593 if (mod) { 3594 3595 /* tmp0.x = -tmp0.z */ 3596 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3597 alu.op = ALU_OP2_SUB_INT; 3598 3599 alu.dst.sel = tmp0; 3600 alu.dst.chan = 0; 3601 alu.dst.write = 1; 3602 3603 alu.src[0].sel = V_SQ_ALU_SRC_0; 3604 alu.src[1].sel = tmp0; 3605 alu.src[1].chan = 2; 3606 3607 alu.last = 1; 3608 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3609 return r; 3610 3611 /* sign of the remainder is the same as the sign of src0 */ 3612 /* tmp0.x = src0>=0 ? 
tmp0.z : tmp0.x */ 3613 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3614 alu.op = ALU_OP3_CNDGE_INT; 3615 alu.is_op3 = 1; 3616 3617 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3618 3619 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3620 alu.src[1].sel = tmp0; 3621 alu.src[1].chan = 2; 3622 alu.src[2].sel = tmp0; 3623 alu.src[2].chan = 0; 3624 3625 alu.last = 1; 3626 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3627 return r; 3628 3629 } else { 3630 3631 /* tmp0.x = -tmp0.z */ 3632 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3633 alu.op = ALU_OP2_SUB_INT; 3634 3635 alu.dst.sel = tmp0; 3636 alu.dst.chan = 0; 3637 alu.dst.write = 1; 3638 3639 alu.src[0].sel = V_SQ_ALU_SRC_0; 3640 alu.src[1].sel = tmp0; 3641 alu.src[1].chan = 2; 3642 3643 alu.last = 1; 3644 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3645 return r; 3646 3647 /* fix the quotient sign (same as the sign of src0*src1) */ 3648 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */ 3649 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3650 alu.op = ALU_OP3_CNDGE_INT; 3651 alu.is_op3 = 1; 3652 3653 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3654 3655 alu.src[0].sel = tmp2; 3656 alu.src[0].chan = 2; 3657 alu.src[1].sel = tmp0; 3658 alu.src[1].chan = 2; 3659 alu.src[2].sel = tmp0; 3660 alu.src[2].chan = 0; 3661 3662 alu.last = 1; 3663 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 3664 return r; 3665 } 3666 } 3667 } 3668 return 0; 3669} 3670 3671static int tgsi_udiv(struct r600_shader_ctx *ctx) 3672{ 3673 return tgsi_divmod(ctx, 0, 0); 3674} 3675 3676static int tgsi_umod(struct r600_shader_ctx *ctx) 3677{ 3678 return tgsi_divmod(ctx, 1, 0); 3679} 3680 3681static int tgsi_idiv(struct r600_shader_ctx *ctx) 3682{ 3683 return tgsi_divmod(ctx, 0, 1); 3684} 3685 3686static int tgsi_imod(struct r600_shader_ctx *ctx) 3687{ 3688 return tgsi_divmod(ctx, 1, 1); 3689} 3690 3691 3692static int tgsi_f2i(struct r600_shader_ctx *ctx) 3693{ 3694 struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction; 3695 struct r600_bytecode_alu alu; 3696 int i, r; 3697 unsigned write_mask = inst->Dst[0].Register.WriteMask; 3698 int last_inst = tgsi_last_instruction(write_mask); 3699 3700 for (i = 0; i < 4; i++) { 3701 if (!(write_mask & (1<<i))) 3702 continue; 3703 3704 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3705 alu.op = ALU_OP1_TRUNC; 3706 3707 alu.dst.sel = ctx->temp_reg; 3708 alu.dst.chan = i; 3709 alu.dst.write = 1; 3710 3711 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3712 if (i == last_inst) 3713 alu.last = 1; 3714 r = r600_bytecode_add_alu(ctx->bc, &alu); 3715 if (r) 3716 return r; 3717 } 3718 3719 for (i = 0; i < 4; i++) { 3720 if (!(write_mask & (1<<i))) 3721 continue; 3722 3723 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3724 alu.op = ctx->inst_info->op; 3725 3726 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3727 3728 alu.src[0].sel = ctx->temp_reg; 3729 alu.src[0].chan = i; 3730 3731 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT) 3732 alu.last = 1; 3733 r = r600_bytecode_add_alu(ctx->bc, &alu); 3734 if (r) 3735 return r; 3736 } 3737 3738 return 0; 3739} 3740 3741static int tgsi_iabs(struct r600_shader_ctx *ctx) 3742{ 3743 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3744 struct r600_bytecode_alu alu; 3745 int i, r; 3746 unsigned write_mask = inst->Dst[0].Register.WriteMask; 3747 int last_inst = tgsi_last_instruction(write_mask); 3748 3749 /* tmp = -src */ 3750 for (i = 0; i < 4; i++) { 3751 if (!(write_mask & (1<<i))) 3752 continue; 3753 3754 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3755 alu.op = ALU_OP2_SUB_INT; 3756 3757 alu.dst.sel = ctx->temp_reg; 3758 alu.dst.chan = i; 3759 alu.dst.write = 1; 3760 3761 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 3762 alu.src[0].sel = V_SQ_ALU_SRC_0; 3763 3764 if (i == last_inst) 3765 alu.last = 1; 3766 r = r600_bytecode_add_alu(ctx->bc, &alu); 3767 if (r) 3768 return r; 3769 } 3770 3771 /* dst = (src >= 0 ? 
src : tmp) */ 3772 for (i = 0; i < 4; i++) { 3773 if (!(write_mask & (1<<i))) 3774 continue; 3775 3776 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3777 alu.op = ALU_OP3_CNDGE_INT; 3778 alu.is_op3 = 1; 3779 alu.dst.write = 1; 3780 3781 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3782 3783 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3784 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 3785 alu.src[2].sel = ctx->temp_reg; 3786 alu.src[2].chan = i; 3787 3788 if (i == last_inst) 3789 alu.last = 1; 3790 r = r600_bytecode_add_alu(ctx->bc, &alu); 3791 if (r) 3792 return r; 3793 } 3794 return 0; 3795} 3796 3797static int tgsi_issg(struct r600_shader_ctx *ctx) 3798{ 3799 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3800 struct r600_bytecode_alu alu; 3801 int i, r; 3802 unsigned write_mask = inst->Dst[0].Register.WriteMask; 3803 int last_inst = tgsi_last_instruction(write_mask); 3804 3805 /* tmp = (src >= 0 ? src : -1) */ 3806 for (i = 0; i < 4; i++) { 3807 if (!(write_mask & (1<<i))) 3808 continue; 3809 3810 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3811 alu.op = ALU_OP3_CNDGE_INT; 3812 alu.is_op3 = 1; 3813 3814 alu.dst.sel = ctx->temp_reg; 3815 alu.dst.chan = i; 3816 alu.dst.write = 1; 3817 3818 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3819 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 3820 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT; 3821 3822 if (i == last_inst) 3823 alu.last = 1; 3824 r = r600_bytecode_add_alu(ctx->bc, &alu); 3825 if (r) 3826 return r; 3827 } 3828 3829 /* dst = (tmp > 0 ? 
1 : tmp) */ 3830 for (i = 0; i < 4; i++) { 3831 if (!(write_mask & (1<<i))) 3832 continue; 3833 3834 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3835 alu.op = ALU_OP3_CNDGT_INT; 3836 alu.is_op3 = 1; 3837 alu.dst.write = 1; 3838 3839 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3840 3841 alu.src[0].sel = ctx->temp_reg; 3842 alu.src[0].chan = i; 3843 3844 alu.src[1].sel = V_SQ_ALU_SRC_1_INT; 3845 3846 alu.src[2].sel = ctx->temp_reg; 3847 alu.src[2].chan = i; 3848 3849 if (i == last_inst) 3850 alu.last = 1; 3851 r = r600_bytecode_add_alu(ctx->bc, &alu); 3852 if (r) 3853 return r; 3854 } 3855 return 0; 3856} 3857 3858 3859 3860static int tgsi_ssg(struct r600_shader_ctx *ctx) 3861{ 3862 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3863 struct r600_bytecode_alu alu; 3864 int i, r; 3865 3866 /* tmp = (src > 0 ? 1 : src) */ 3867 for (i = 0; i < 4; i++) { 3868 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3869 alu.op = ALU_OP3_CNDGT; 3870 alu.is_op3 = 1; 3871 3872 alu.dst.sel = ctx->temp_reg; 3873 alu.dst.chan = i; 3874 3875 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 3876 alu.src[1].sel = V_SQ_ALU_SRC_1; 3877 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 3878 3879 if (i == 3) 3880 alu.last = 1; 3881 r = r600_bytecode_add_alu(ctx->bc, &alu); 3882 if (r) 3883 return r; 3884 } 3885 3886 /* dst = (-tmp > 0 ? 
-1 : tmp) */ 3887 for (i = 0; i < 4; i++) { 3888 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3889 alu.op = ALU_OP3_CNDGT; 3890 alu.is_op3 = 1; 3891 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3892 3893 alu.src[0].sel = ctx->temp_reg; 3894 alu.src[0].chan = i; 3895 alu.src[0].neg = 1; 3896 3897 alu.src[1].sel = V_SQ_ALU_SRC_1; 3898 alu.src[1].neg = 1; 3899 3900 alu.src[2].sel = ctx->temp_reg; 3901 alu.src[2].chan = i; 3902 3903 if (i == 3) 3904 alu.last = 1; 3905 r = r600_bytecode_add_alu(ctx->bc, &alu); 3906 if (r) 3907 return r; 3908 } 3909 return 0; 3910} 3911 3912static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst) 3913{ 3914 struct r600_bytecode_alu alu; 3915 int i, r; 3916 3917 for (i = 0; i < 4; i++) { 3918 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3919 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) { 3920 alu.op = ALU_OP0_NOP; 3921 alu.dst.chan = i; 3922 } else { 3923 alu.op = ALU_OP1_MOV; 3924 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3925 alu.src[0].sel = ctx->temp_reg; 3926 alu.src[0].chan = i; 3927 } 3928 if (i == 3) { 3929 alu.last = 1; 3930 } 3931 r = r600_bytecode_add_alu(ctx->bc, &alu); 3932 if (r) 3933 return r; 3934 } 3935 return 0; 3936} 3937 3938static int tgsi_op3(struct r600_shader_ctx *ctx) 3939{ 3940 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3941 struct r600_bytecode_alu alu; 3942 int i, j, r; 3943 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 3944 3945 for (i = 0; i < lasti + 1; i++) { 3946 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 3947 continue; 3948 3949 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3950 alu.op = ctx->inst_info->op; 3951 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 3952 r600_bytecode_src(&alu.src[j], &ctx->src[j], i); 3953 } 3954 3955 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3956 alu.dst.chan = i; 3957 alu.dst.write = 1; 3958 alu.is_op3 = 1; 3959 if (i == lasti) { 3960 alu.last = 1; 
3961 } 3962 r = r600_bytecode_add_alu(ctx->bc, &alu); 3963 if (r) 3964 return r; 3965 } 3966 return 0; 3967} 3968 3969static int tgsi_dp(struct r600_shader_ctx *ctx) 3970{ 3971 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 3972 struct r600_bytecode_alu alu; 3973 int i, j, r; 3974 3975 for (i = 0; i < 4; i++) { 3976 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3977 alu.op = ctx->inst_info->op; 3978 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { 3979 r600_bytecode_src(&alu.src[j], &ctx->src[j], i); 3980 } 3981 3982 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 3983 alu.dst.chan = i; 3984 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 3985 /* handle some special cases */ 3986 switch (ctx->inst_info->tgsi_opcode) { 3987 case TGSI_OPCODE_DP2: 3988 if (i > 1) { 3989 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; 3990 alu.src[0].chan = alu.src[1].chan = 0; 3991 } 3992 break; 3993 case TGSI_OPCODE_DP3: 3994 if (i > 2) { 3995 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; 3996 alu.src[0].chan = alu.src[1].chan = 0; 3997 } 3998 break; 3999 case TGSI_OPCODE_DPH: 4000 if (i == 3) { 4001 alu.src[0].sel = V_SQ_ALU_SRC_1; 4002 alu.src[0].chan = 0; 4003 alu.src[0].neg = 0; 4004 } 4005 break; 4006 default: 4007 break; 4008 } 4009 if (i == 3) { 4010 alu.last = 1; 4011 } 4012 r = r600_bytecode_add_alu(ctx->bc, &alu); 4013 if (r) 4014 return r; 4015 } 4016 return 0; 4017} 4018 4019static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx, 4020 unsigned index) 4021{ 4022 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4023 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY && 4024 inst->Src[index].Register.File != TGSI_FILE_INPUT && 4025 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) || 4026 ctx->src[index].neg || ctx->src[index].abs; 4027} 4028 4029static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx, 4030 unsigned index) 4031{ 4032 struct 
tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4033 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index; 4034} 4035 4036static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading) 4037{ 4038 struct r600_bytecode_vtx vtx; 4039 struct r600_bytecode_alu alu; 4040 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4041 int src_gpr, r, i; 4042 int id = tgsi_tex_get_src_gpr(ctx, 1); 4043 4044 src_gpr = tgsi_tex_get_src_gpr(ctx, 0); 4045 if (src_requires_loading) { 4046 for (i = 0; i < 4; i++) { 4047 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4048 alu.op = ALU_OP1_MOV; 4049 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4050 alu.dst.sel = ctx->temp_reg; 4051 alu.dst.chan = i; 4052 if (i == 3) 4053 alu.last = 1; 4054 alu.dst.write = 1; 4055 r = r600_bytecode_add_alu(ctx->bc, &alu); 4056 if (r) 4057 return r; 4058 } 4059 src_gpr = ctx->temp_reg; 4060 } 4061 4062 memset(&vtx, 0, sizeof(vtx)); 4063 vtx.op = FETCH_OP_VFETCH; 4064 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS; 4065 vtx.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */ 4066 vtx.src_gpr = src_gpr; 4067 vtx.mega_fetch_count = 16; 4068 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 4069 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */ 4070 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */ 4071 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */ 4072 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; /* SEL_W */ 4073 vtx.use_const_fields = 1; 4074 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */ 4075 4076 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx))) 4077 return r; 4078 4079 if (ctx->bc->chip_class >= EVERGREEN) 4080 return 0; 4081 4082 for (i = 0; i < 4; i++) { 4083 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 4084 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 4085 continue; 4086 4087 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4088 alu.op = ALU_OP2_AND_INT; 4089 4090 alu.dst.chan = i; 4091 alu.dst.sel = vtx.dst_gpr; 4092 alu.dst.write = 1; 4093 4094 alu.src[0].sel = vtx.dst_gpr; 4095 alu.src[0].chan = i; 4096 4097 alu.src[1].sel = 512 + (id * 2); 4098 alu.src[1].chan = i % 4; 4099 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 4100 4101 if (i == lasti) 4102 alu.last = 1; 4103 r = r600_bytecode_add_alu(ctx->bc, &alu); 4104 if (r) 4105 return r; 4106 } 4107 4108 if (inst->Dst[0].Register.WriteMask & 3) { 4109 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4110 alu.op = ALU_OP2_OR_INT; 4111 4112 alu.dst.chan = 3; 4113 alu.dst.sel = vtx.dst_gpr; 4114 alu.dst.write = 1; 4115 4116 alu.src[0].sel = vtx.dst_gpr; 4117 alu.src[0].chan = 3; 4118 4119 alu.src[1].sel = 512 + (id * 2) + 1; 4120 alu.src[1].chan = 0; 4121 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 4122 4123 alu.last = 1; 4124 r = r600_bytecode_add_alu(ctx->bc, &alu); 4125 if (r) 4126 return r; 4127 } 4128 return 0; 4129} 4130 4131static int r600_do_buffer_txq(struct r600_shader_ctx *ctx) 4132{ 4133 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4134 struct r600_bytecode_alu alu; 4135 int r; 4136 int id = tgsi_tex_get_src_gpr(ctx, 1); 4137 4138 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4139 alu.op = ALU_OP1_MOV; 4140 4141 if (ctx->bc->chip_class >= EVERGREEN) { 4142 alu.src[0].sel = 512 + (id / 4); 4143 alu.src[0].chan = id % 4; 4144 } else { 4145 /* r600 we have them at channel 2 of the second dword */ 4146 
alu.src[0].sel = 512 + (id * 2) + 1; 4147 alu.src[0].chan = 1; 4148 } 4149 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 4150 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 4151 alu.last = 1; 4152 r = r600_bytecode_add_alu(ctx->bc, &alu); 4153 if (r) 4154 return r; 4155 return 0; 4156} 4157 4158static int tgsi_tex(struct r600_shader_ctx *ctx) 4159{ 4160 static float one_point_five = 1.5f; 4161 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4162 struct r600_bytecode_tex tex; 4163 struct r600_bytecode_alu alu; 4164 unsigned src_gpr; 4165 int r, i, j; 4166 int opcode; 4167 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing && 4168 inst->Instruction.Opcode == TGSI_OPCODE_TXF && 4169 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA || 4170 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA); 4171 4172 /* Texture fetch instructions can only use gprs as source. 4173 * Also they cannot negate the source or take the absolute value */ 4174 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ && 4175 tgsi_tex_src_requires_loading(ctx, 0)) || 4176 read_compressed_msaa; 4177 boolean src_loaded = FALSE; 4178 unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 
0 : 1; 4179 int8_t offset_x = 0, offset_y = 0, offset_z = 0; 4180 boolean has_txq_cube_array_z = false; 4181 4182 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ && 4183 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 4184 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY))) 4185 if (inst->Dst[0].Register.WriteMask & 4) { 4186 ctx->shader->has_txq_cube_array_z_comp = true; 4187 has_txq_cube_array_z = true; 4188 } 4189 4190 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 || 4191 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 4192 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) 4193 sampler_src_reg = 2; 4194 4195 src_gpr = tgsi_tex_get_src_gpr(ctx, 0); 4196 4197 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) { 4198 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) { 4199 ctx->shader->uses_tex_buffers = true; 4200 return r600_do_buffer_txq(ctx); 4201 } 4202 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) { 4203 if (ctx->bc->chip_class < EVERGREEN) 4204 ctx->shader->uses_tex_buffers = true; 4205 return do_vtx_fetch_inst(ctx, src_requires_loading); 4206 } 4207 } 4208 4209 /* get offset values */ 4210 if (inst->Texture.NumOffsets) { 4211 assert(inst->Texture.NumOffsets == 1); 4212 4213 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1; 4214 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1; 4215 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1; 4216 } 4217 4218 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) { 4219 /* TGSI moves the sampler to src reg 3 for TXD */ 4220 sampler_src_reg = 3; 4221 4222 for (i = 1; i < 3; i++) { 4223 /* set gradients h/v */ 4224 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 4225 tex.op = (i == 1) ? 
FETCH_OP_SET_GRADIENTS_H : 4226 FETCH_OP_SET_GRADIENTS_V; 4227 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 4228 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 4229 4230 if (tgsi_tex_src_requires_loading(ctx, i)) { 4231 tex.src_gpr = r600_get_temp(ctx); 4232 tex.src_sel_x = 0; 4233 tex.src_sel_y = 1; 4234 tex.src_sel_z = 2; 4235 tex.src_sel_w = 3; 4236 4237 for (j = 0; j < 4; j++) { 4238 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4239 alu.op = ALU_OP1_MOV; 4240 r600_bytecode_src(&alu.src[0], &ctx->src[i], j); 4241 alu.dst.sel = tex.src_gpr; 4242 alu.dst.chan = j; 4243 if (j == 3) 4244 alu.last = 1; 4245 alu.dst.write = 1; 4246 r = r600_bytecode_add_alu(ctx->bc, &alu); 4247 if (r) 4248 return r; 4249 } 4250 4251 } else { 4252 tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i); 4253 tex.src_sel_x = ctx->src[i].swizzle[0]; 4254 tex.src_sel_y = ctx->src[i].swizzle[1]; 4255 tex.src_sel_z = ctx->src[i].swizzle[2]; 4256 tex.src_sel_w = ctx->src[i].swizzle[3]; 4257 tex.src_rel = ctx->src[i].rel; 4258 } 4259 tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */ 4260 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7; 4261 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) { 4262 tex.coord_type_x = 1; 4263 tex.coord_type_y = 1; 4264 tex.coord_type_z = 1; 4265 tex.coord_type_w = 1; 4266 } 4267 r = r600_bytecode_add_tex(ctx->bc, &tex); 4268 if (r) 4269 return r; 4270 } 4271 } else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) { 4272 int out_chan; 4273 /* Add perspective divide */ 4274 if (ctx->bc->chip_class == CAYMAN) { 4275 out_chan = 2; 4276 for (i = 0; i < 3; i++) { 4277 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4278 alu.op = ALU_OP1_RECIP_IEEE; 4279 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4280 4281 alu.dst.sel = ctx->temp_reg; 4282 alu.dst.chan = i; 4283 if (i == 2) 4284 alu.last = 1; 4285 if (out_chan == i) 4286 alu.dst.write = 1; 4287 r = r600_bytecode_add_alu(ctx->bc, &alu); 4288 if (r) 
4289 return r; 4290 } 4291 4292 } else { 4293 out_chan = 3; 4294 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4295 alu.op = ALU_OP1_RECIP_IEEE; 4296 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4297 4298 alu.dst.sel = ctx->temp_reg; 4299 alu.dst.chan = out_chan; 4300 alu.last = 1; 4301 alu.dst.write = 1; 4302 r = r600_bytecode_add_alu(ctx->bc, &alu); 4303 if (r) 4304 return r; 4305 } 4306 4307 for (i = 0; i < 3; i++) { 4308 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4309 alu.op = ALU_OP2_MUL; 4310 alu.src[0].sel = ctx->temp_reg; 4311 alu.src[0].chan = out_chan; 4312 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 4313 alu.dst.sel = ctx->temp_reg; 4314 alu.dst.chan = i; 4315 alu.dst.write = 1; 4316 r = r600_bytecode_add_alu(ctx->bc, &alu); 4317 if (r) 4318 return r; 4319 } 4320 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4321 alu.op = ALU_OP1_MOV; 4322 alu.src[0].sel = V_SQ_ALU_SRC_1; 4323 alu.src[0].chan = 0; 4324 alu.dst.sel = ctx->temp_reg; 4325 alu.dst.chan = 3; 4326 alu.last = 1; 4327 alu.dst.write = 1; 4328 r = r600_bytecode_add_alu(ctx->bc, &alu); 4329 if (r) 4330 return r; 4331 src_loaded = TRUE; 4332 src_gpr = ctx->temp_reg; 4333 } 4334 4335 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || 4336 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 4337 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 4338 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && 4339 inst->Instruction.Opcode != TGSI_OPCODE_TXQ && 4340 inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { 4341 4342 static const unsigned src0_swizzle[] = {2, 2, 0, 1}; 4343 static const unsigned src1_swizzle[] = {1, 0, 2, 2}; 4344 4345 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */ 4346 for (i = 0; i < 4; i++) { 4347 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4348 alu.op = ALU_OP2_CUBE; 4349 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]); 4350 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]); 4351 alu.dst.sel = ctx->temp_reg; 4352 
alu.dst.chan = i; 4353 if (i == 3) 4354 alu.last = 1; 4355 alu.dst.write = 1; 4356 r = r600_bytecode_add_alu(ctx->bc, &alu); 4357 if (r) 4358 return r; 4359 } 4360 4361 /* tmp1.z = RCP_e(|tmp1.z|) */ 4362 if (ctx->bc->chip_class == CAYMAN) { 4363 for (i = 0; i < 3; i++) { 4364 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4365 alu.op = ALU_OP1_RECIP_IEEE; 4366 alu.src[0].sel = ctx->temp_reg; 4367 alu.src[0].chan = 2; 4368 alu.src[0].abs = 1; 4369 alu.dst.sel = ctx->temp_reg; 4370 alu.dst.chan = i; 4371 if (i == 2) 4372 alu.dst.write = 1; 4373 if (i == 2) 4374 alu.last = 1; 4375 r = r600_bytecode_add_alu(ctx->bc, &alu); 4376 if (r) 4377 return r; 4378 } 4379 } else { 4380 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4381 alu.op = ALU_OP1_RECIP_IEEE; 4382 alu.src[0].sel = ctx->temp_reg; 4383 alu.src[0].chan = 2; 4384 alu.src[0].abs = 1; 4385 alu.dst.sel = ctx->temp_reg; 4386 alu.dst.chan = 2; 4387 alu.dst.write = 1; 4388 alu.last = 1; 4389 r = r600_bytecode_add_alu(ctx->bc, &alu); 4390 if (r) 4391 return r; 4392 } 4393 4394 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x 4395 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x 4396 * muladd has no writemask, have to use another temp 4397 */ 4398 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4399 alu.op = ALU_OP3_MULADD; 4400 alu.is_op3 = 1; 4401 4402 alu.src[0].sel = ctx->temp_reg; 4403 alu.src[0].chan = 0; 4404 alu.src[1].sel = ctx->temp_reg; 4405 alu.src[1].chan = 2; 4406 4407 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 4408 alu.src[2].chan = 0; 4409 alu.src[2].value = *(uint32_t *)&one_point_five; 4410 4411 alu.dst.sel = ctx->temp_reg; 4412 alu.dst.chan = 0; 4413 alu.dst.write = 1; 4414 4415 r = r600_bytecode_add_alu(ctx->bc, &alu); 4416 if (r) 4417 return r; 4418 4419 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4420 alu.op = ALU_OP3_MULADD; 4421 alu.is_op3 = 1; 4422 4423 alu.src[0].sel = ctx->temp_reg; 4424 alu.src[0].chan = 1; 4425 alu.src[1].sel = ctx->temp_reg; 4426 alu.src[1].chan = 2; 4427 4428 
alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 4429 alu.src[2].chan = 0; 4430 alu.src[2].value = *(uint32_t *)&one_point_five; 4431 4432 alu.dst.sel = ctx->temp_reg; 4433 alu.dst.chan = 1; 4434 alu.dst.write = 1; 4435 4436 alu.last = 1; 4437 r = r600_bytecode_add_alu(ctx->bc, &alu); 4438 if (r) 4439 return r; 4440 /* write initial compare value into Z component 4441 - W src 0 for shadow cube 4442 - X src 1 for shadow cube array */ 4443 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 4444 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 4445 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4446 alu.op = ALU_OP1_MOV; 4447 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) 4448 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 4449 else 4450 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4451 alu.dst.sel = ctx->temp_reg; 4452 alu.dst.chan = 2; 4453 alu.dst.write = 1; 4454 alu.last = 1; 4455 r = r600_bytecode_add_alu(ctx->bc, &alu); 4456 if (r) 4457 return r; 4458 } 4459 4460 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 4461 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 4462 if (ctx->bc->chip_class >= EVERGREEN) { 4463 int mytmp = r600_get_temp(ctx); 4464 static const float eight = 8.0f; 4465 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4466 alu.op = ALU_OP1_MOV; 4467 alu.src[0].sel = ctx->temp_reg; 4468 alu.src[0].chan = 3; 4469 alu.dst.sel = mytmp; 4470 alu.dst.chan = 0; 4471 alu.dst.write = 1; 4472 alu.last = 1; 4473 r = r600_bytecode_add_alu(ctx->bc, &alu); 4474 if (r) 4475 return r; 4476 4477 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */ 4478 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4479 alu.op = ALU_OP3_MULADD; 4480 alu.is_op3 = 1; 4481 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4482 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4483 alu.src[1].chan = 0; 4484 alu.src[1].value = *(uint32_t *)&eight; 4485 alu.src[2].sel = mytmp; 4486 alu.src[2].chan = 0; 4487 alu.dst.sel = 
ctx->temp_reg; 4488 alu.dst.chan = 3; 4489 alu.dst.write = 1; 4490 alu.last = 1; 4491 r = r600_bytecode_add_alu(ctx->bc, &alu); 4492 if (r) 4493 return r; 4494 } else if (ctx->bc->chip_class < EVERGREEN) { 4495 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 4496 tex.op = FETCH_OP_SET_CUBEMAP_INDEX; 4497 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 4498 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 4499 tex.src_gpr = r600_get_temp(ctx); 4500 tex.src_sel_x = 0; 4501 tex.src_sel_y = 0; 4502 tex.src_sel_z = 0; 4503 tex.src_sel_w = 0; 4504 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7; 4505 tex.coord_type_x = 1; 4506 tex.coord_type_y = 1; 4507 tex.coord_type_z = 1; 4508 tex.coord_type_w = 1; 4509 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4510 alu.op = ALU_OP1_MOV; 4511 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4512 alu.dst.sel = tex.src_gpr; 4513 alu.dst.chan = 0; 4514 alu.last = 1; 4515 alu.dst.write = 1; 4516 r = r600_bytecode_add_alu(ctx->bc, &alu); 4517 if (r) 4518 return r; 4519 4520 r = r600_bytecode_add_tex(ctx->bc, &tex); 4521 if (r) 4522 return r; 4523 } 4524 4525 } 4526 4527 /* for cube forms of lod and bias we need to route things */ 4528 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB || 4529 inst->Instruction.Opcode == TGSI_OPCODE_TXL || 4530 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 4531 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { 4532 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4533 alu.op = ALU_OP1_MOV; 4534 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 4535 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) 4536 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 4537 else 4538 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 4539 alu.dst.sel = ctx->temp_reg; 4540 alu.dst.chan = 2; 4541 alu.last = 1; 4542 alu.dst.write = 1; 4543 r = r600_bytecode_add_alu(ctx->bc, &alu); 4544 if (r) 4545 return r; 4546 } 4547 4548 src_loaded = TRUE; 4549 src_gpr = ctx->temp_reg; 4550 } 
4551 4552 if (src_requires_loading && !src_loaded) { 4553 for (i = 0; i < 4; i++) { 4554 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4555 alu.op = ALU_OP1_MOV; 4556 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4557 alu.dst.sel = ctx->temp_reg; 4558 alu.dst.chan = i; 4559 if (i == 3) 4560 alu.last = 1; 4561 alu.dst.write = 1; 4562 r = r600_bytecode_add_alu(ctx->bc, &alu); 4563 if (r) 4564 return r; 4565 } 4566 src_loaded = TRUE; 4567 src_gpr = ctx->temp_reg; 4568 } 4569 4570 /* Obtain the sample index for reading a compressed MSAA color texture. 4571 * To read the FMASK, we use the ldfptr instruction, which tells us 4572 * where the samples are stored. 4573 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210, 4574 * which is the identity mapping. Each nibble says which physical sample 4575 * should be fetched to get that sample. 4576 * 4577 * Assume src.z contains the sample index. It should be modified like this: 4578 * src.z = (ldfptr() >> (src.z * 4)) & 0xF; 4579 * Then fetch the texel with src. 
4580 */ 4581 if (read_compressed_msaa) { 4582 unsigned sample_chan = 3; 4583 unsigned temp = r600_get_temp(ctx); 4584 assert(src_loaded); 4585 4586 /* temp.w = ldfptr() */ 4587 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 4588 tex.op = FETCH_OP_LD; 4589 tex.inst_mod = 1; /* to indicate this is ldfptr */ 4590 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 4591 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 4592 tex.src_gpr = src_gpr; 4593 tex.dst_gpr = temp; 4594 tex.dst_sel_x = 7; /* mask out these components */ 4595 tex.dst_sel_y = 7; 4596 tex.dst_sel_z = 7; 4597 tex.dst_sel_w = 0; /* store X */ 4598 tex.src_sel_x = 0; 4599 tex.src_sel_y = 1; 4600 tex.src_sel_z = 2; 4601 tex.src_sel_w = 3; 4602 tex.offset_x = offset_x; 4603 tex.offset_y = offset_y; 4604 tex.offset_z = offset_z; 4605 r = r600_bytecode_add_tex(ctx->bc, &tex); 4606 if (r) 4607 return r; 4608 4609 /* temp.x = sample_index*4 */ 4610 if (ctx->bc->chip_class == CAYMAN) { 4611 for (i = 0 ; i < 4; i++) { 4612 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4613 alu.op = ALU_OP2_MULLO_INT; 4614 alu.src[0].sel = src_gpr; 4615 alu.src[0].chan = sample_chan; 4616 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4617 alu.src[1].value = 4; 4618 alu.dst.sel = temp; 4619 alu.dst.chan = i; 4620 alu.dst.write = i == 0; 4621 if (i == 3) 4622 alu.last = 1; 4623 r = r600_bytecode_add_alu(ctx->bc, &alu); 4624 if (r) 4625 return r; 4626 } 4627 } else { 4628 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4629 alu.op = ALU_OP2_MULLO_INT; 4630 alu.src[0].sel = src_gpr; 4631 alu.src[0].chan = sample_chan; 4632 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4633 alu.src[1].value = 4; 4634 alu.dst.sel = temp; 4635 alu.dst.chan = 0; 4636 alu.dst.write = 1; 4637 alu.last = 1; 4638 r = r600_bytecode_add_alu(ctx->bc, &alu); 4639 if (r) 4640 return r; 4641 } 4642 4643 /* sample_index = temp.w >> temp.x */ 4644 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4645 alu.op = ALU_OP2_LSHR_INT; 4646 alu.src[0].sel 
= temp; 4647 alu.src[0].chan = 3; 4648 alu.src[1].sel = temp; 4649 alu.src[1].chan = 0; 4650 alu.dst.sel = src_gpr; 4651 alu.dst.chan = sample_chan; 4652 alu.dst.write = 1; 4653 alu.last = 1; 4654 r = r600_bytecode_add_alu(ctx->bc, &alu); 4655 if (r) 4656 return r; 4657 4658 /* sample_index & 0xF */ 4659 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4660 alu.op = ALU_OP2_AND_INT; 4661 alu.src[0].sel = src_gpr; 4662 alu.src[0].chan = sample_chan; 4663 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 4664 alu.src[1].value = 0xF; 4665 alu.dst.sel = src_gpr; 4666 alu.dst.chan = sample_chan; 4667 alu.dst.write = 1; 4668 alu.last = 1; 4669 r = r600_bytecode_add_alu(ctx->bc, &alu); 4670 if (r) 4671 return r; 4672#if 0 4673 /* visualize the FMASK */ 4674 for (i = 0; i < 4; i++) { 4675 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4676 alu.op = ALU_OP1_INT_TO_FLT; 4677 alu.src[0].sel = src_gpr; 4678 alu.src[0].chan = sample_chan; 4679 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 4680 alu.dst.chan = i; 4681 alu.dst.write = 1; 4682 alu.last = 1; 4683 r = r600_bytecode_add_alu(ctx->bc, &alu); 4684 if (r) 4685 return r; 4686 } 4687 return 0; 4688#endif 4689 } 4690 4691 /* does this shader want a num layers from TXQ for a cube array? 
*/ 4692 if (has_txq_cube_array_z) { 4693 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 4694 4695 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4696 alu.op = ALU_OP1_MOV; 4697 4698 alu.src[0].sel = 512 + (id / 4); 4699 alu.src[0].kc_bank = R600_TXQ_CONST_BUFFER; 4700 alu.src[0].chan = id % 4; 4701 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); 4702 alu.last = 1; 4703 r = r600_bytecode_add_alu(ctx->bc, &alu); 4704 if (r) 4705 return r; 4706 /* disable writemask from texture instruction */ 4707 inst->Dst[0].Register.WriteMask &= ~4; 4708 } 4709 4710 opcode = ctx->inst_info->op; 4711 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || 4712 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 4713 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT || 4714 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 4715 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY || 4716 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY || 4717 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 4718 switch (opcode) { 4719 case FETCH_OP_SAMPLE: 4720 opcode = FETCH_OP_SAMPLE_C; 4721 break; 4722 case FETCH_OP_SAMPLE_L: 4723 opcode = FETCH_OP_SAMPLE_C_L; 4724 break; 4725 case FETCH_OP_SAMPLE_LB: 4726 opcode = FETCH_OP_SAMPLE_C_LB; 4727 break; 4728 case FETCH_OP_SAMPLE_G: 4729 opcode = FETCH_OP_SAMPLE_C_G; 4730 break; 4731 } 4732 } 4733 4734 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 4735 tex.op = opcode; 4736 4737 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 4738 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 4739 tex.src_gpr = src_gpr; 4740 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 4741 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 4742 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 4743 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; 4744 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; 4745 4746 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) { 4747 tex.src_sel_x = 4; 4748 tex.src_sel_y = 4; 4749 tex.src_sel_z = 4; 4750 tex.src_sel_w = 4; 4751 } else if (src_loaded) { 4752 tex.src_sel_x = 0; 4753 tex.src_sel_y = 1; 4754 tex.src_sel_z = 2; 4755 tex.src_sel_w = 3; 4756 } else { 4757 tex.src_sel_x = ctx->src[0].swizzle[0]; 4758 tex.src_sel_y = ctx->src[0].swizzle[1]; 4759 tex.src_sel_z = ctx->src[0].swizzle[2]; 4760 tex.src_sel_w = ctx->src[0].swizzle[3]; 4761 tex.src_rel = ctx->src[0].rel; 4762 } 4763 4764 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE || 4765 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 4766 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 4767 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 4768 tex.src_sel_x = 1; 4769 tex.src_sel_y = 0; 4770 tex.src_sel_z = 3; 4771 tex.src_sel_w = 2; /* route Z compare or Lod value into W */ 4772 } 4773 4774 if (inst->Texture.Texture != TGSI_TEXTURE_RECT && 4775 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) { 4776 tex.coord_type_x = 1; 4777 tex.coord_type_y = 1; 4778 } 4779 tex.coord_type_z = 1; 4780 tex.coord_type_w = 1; 4781 4782 tex.offset_x = offset_x; 4783 tex.offset_y = offset_y; 4784 tex.offset_z = offset_z; 4785 4786 /* Put the depth for comparison in W. 4787 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W. 4788 * Some instructions expect the depth in Z. 
*/ 4789 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || 4790 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 4791 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT || 4792 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) && 4793 opcode != FETCH_OP_SAMPLE_C_L && 4794 opcode != FETCH_OP_SAMPLE_C_LB) { 4795 tex.src_sel_w = tex.src_sel_z; 4796 } 4797 4798 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY || 4799 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) { 4800 if (opcode == FETCH_OP_SAMPLE_C_L || 4801 opcode == FETCH_OP_SAMPLE_C_LB) { 4802 /* the array index is read from Y */ 4803 tex.coord_type_y = 0; 4804 } else { 4805 /* the array index is read from Z */ 4806 tex.coord_type_z = 0; 4807 tex.src_sel_z = tex.src_sel_y; 4808 } 4809 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 4810 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY || 4811 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 4812 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && 4813 (ctx->bc->chip_class >= EVERGREEN))) 4814 /* the array index is read from Z */ 4815 tex.coord_type_z = 0; 4816 4817 /* mask unused source components */ 4818 if (opcode == FETCH_OP_SAMPLE) { 4819 switch (inst->Texture.Texture) { 4820 case TGSI_TEXTURE_2D: 4821 case TGSI_TEXTURE_RECT: 4822 tex.src_sel_z = 7; 4823 tex.src_sel_w = 7; 4824 break; 4825 case TGSI_TEXTURE_1D_ARRAY: 4826 tex.src_sel_y = 7; 4827 tex.src_sel_w = 7; 4828 break; 4829 case TGSI_TEXTURE_1D: 4830 tex.src_sel_y = 7; 4831 tex.src_sel_z = 7; 4832 tex.src_sel_w = 7; 4833 break; 4834 } 4835 } 4836 4837 r = r600_bytecode_add_tex(ctx->bc, &tex); 4838 if (r) 4839 return r; 4840 4841 /* add shadow ambient support - gallium doesn't do it yet */ 4842 return 0; 4843} 4844 4845static int tgsi_lrp(struct r600_shader_ctx *ctx) 4846{ 4847 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4848 struct r600_bytecode_alu alu; 4849 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 
	unsigned i;
	int r;

	/* LRP: dst = src0 * src1 + (1 - src0) * src2, per enabled write-mask
	 * channel.
	 *
	 * Fast path: when src0 is the inline constant 0.5 the blend collapses
	 * to (src1 + src2) / 2, done in a single ADD using the ALU output
	 * modifier (omod = 3 is the divide-by-2 modifier). */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			/* skip channels masked off in the destination */
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3;	/* output modifier: result / 2 */
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;	/* close the ALU instruction group */
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* General path, step 1: temp = 1 - src0
	 * (encoded as 1 + (-src0) by toggling the source negate bit). */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Step 2: temp = (1 - src0) * src2, accumulated in the same temp reg. */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Step 3: dst = src0 * src1 + temp  (MULADD completes the blend). */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
4923 4924 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4925 alu.op = ALU_OP3_MULADD; 4926 alu.is_op3 = 1; 4927 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4928 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 4929 alu.src[2].sel = ctx->temp_reg; 4930 alu.src[2].chan = i; 4931 4932 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4933 alu.dst.chan = i; 4934 if (i == lasti) { 4935 alu.last = 1; 4936 } 4937 r = r600_bytecode_add_alu(ctx->bc, &alu); 4938 if (r) 4939 return r; 4940 } 4941 return 0; 4942} 4943 4944static int tgsi_cmp(struct r600_shader_ctx *ctx) 4945{ 4946 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4947 struct r600_bytecode_alu alu; 4948 int i, r; 4949 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 4950 4951 for (i = 0; i < lasti + 1; i++) { 4952 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 4953 continue; 4954 4955 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4956 alu.op = ALU_OP3_CNDGE; 4957 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4958 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 4959 r600_bytecode_src(&alu.src[2], &ctx->src[1], i); 4960 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 4961 alu.dst.chan = i; 4962 alu.dst.write = 1; 4963 alu.is_op3 = 1; 4964 if (i == lasti) 4965 alu.last = 1; 4966 r = r600_bytecode_add_alu(ctx->bc, &alu); 4967 if (r) 4968 return r; 4969 } 4970 return 0; 4971} 4972 4973static int tgsi_ucmp(struct r600_shader_ctx *ctx) 4974{ 4975 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 4976 struct r600_bytecode_alu alu; 4977 int i, r; 4978 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 4979 4980 for (i = 0; i < lasti + 1; i++) { 4981 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 4982 continue; 4983 4984 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 4985 alu.op = ALU_OP3_CNDGE_INT; 4986 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 4987 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 4988 
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* XPD: cross product, dst.xyz = src0 x src1.
 *
 * Computed in two passes using the identity
 *   (a x b).i = a[s0[i]] * b[s1[i]] - a[s1[i]] * b[s0[i]]
 * with the swizzle tables below: pass 1 stores the second product in a
 * temp, pass 2 does MULADD(a.s1, b.s0, -temp) into the destination.
 *
 * NOTE(review): for channel 3 both passes feed constant zero operands,
 * so dst.w ends up 0 here, while the TGSI spec defines XPD.w = 1.0 —
 * confirm whether any state tracker relies on the w component.
 */
static int tgsi_xpd(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	static const unsigned int src0_swizzle[] = {2, 0, 1};
	static const unsigned int src1_swizzle[] = {1, 2, 0};
	struct r600_bytecode_alu alu;
	uint32_t use_temp = 0;
	int i, r;

	/* a partial writemask forces staging through the temp register so
	 * masked channels of the destination are left untouched */
	if (inst->Dst[0].Register.WriteMask != 0xf)
		use_temp = 1;

	/* pass 1: temp.i = src0[zxy] * src1[yzx] (0 for the w slot) */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pass 2: dst.i = src0[yzx] * src1[zxy] - temp.i (src2 negated) */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;

		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].neg = 1;
		alu.src[2].chan = i;

		if (use_temp)
			alu.dst.sel = ctx->temp_reg;
		else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
alu.dst.chan = i; 5061 alu.dst.write = 1; 5062 alu.is_op3 = 1; 5063 if (i == 3) 5064 alu.last = 1; 5065 r = r600_bytecode_add_alu(ctx->bc, &alu); 5066 if (r) 5067 return r; 5068 } 5069 if (use_temp) 5070 return tgsi_helper_copy(ctx, inst); 5071 return 0; 5072} 5073 5074static int tgsi_exp(struct r600_shader_ctx *ctx) 5075{ 5076 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5077 struct r600_bytecode_alu alu; 5078 int r; 5079 int i; 5080 5081 /* result.x = 2^floor(src); */ 5082 if (inst->Dst[0].Register.WriteMask & 1) { 5083 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5084 5085 alu.op = ALU_OP1_FLOOR; 5086 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5087 5088 alu.dst.sel = ctx->temp_reg; 5089 alu.dst.chan = 0; 5090 alu.dst.write = 1; 5091 alu.last = 1; 5092 r = r600_bytecode_add_alu(ctx->bc, &alu); 5093 if (r) 5094 return r; 5095 5096 if (ctx->bc->chip_class == CAYMAN) { 5097 for (i = 0; i < 3; i++) { 5098 alu.op = ALU_OP1_EXP_IEEE; 5099 alu.src[0].sel = ctx->temp_reg; 5100 alu.src[0].chan = 0; 5101 5102 alu.dst.sel = ctx->temp_reg; 5103 alu.dst.chan = i; 5104 alu.dst.write = i == 0; 5105 alu.last = i == 2; 5106 r = r600_bytecode_add_alu(ctx->bc, &alu); 5107 if (r) 5108 return r; 5109 } 5110 } else { 5111 alu.op = ALU_OP1_EXP_IEEE; 5112 alu.src[0].sel = ctx->temp_reg; 5113 alu.src[0].chan = 0; 5114 5115 alu.dst.sel = ctx->temp_reg; 5116 alu.dst.chan = 0; 5117 alu.dst.write = 1; 5118 alu.last = 1; 5119 r = r600_bytecode_add_alu(ctx->bc, &alu); 5120 if (r) 5121 return r; 5122 } 5123 } 5124 5125 /* result.y = tmp - floor(tmp); */ 5126 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) { 5127 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5128 5129 alu.op = ALU_OP1_FRACT; 5130 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5131 5132 alu.dst.sel = ctx->temp_reg; 5133#if 0 5134 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5135 if (r) 5136 return r; 5137#endif 5138 alu.dst.write = 1; 5139 alu.dst.chan = 1; 5140 5141 alu.last = 
1; 5142 5143 r = r600_bytecode_add_alu(ctx->bc, &alu); 5144 if (r) 5145 return r; 5146 } 5147 5148 /* result.z = RoughApprox2ToX(tmp);*/ 5149 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) { 5150 if (ctx->bc->chip_class == CAYMAN) { 5151 for (i = 0; i < 3; i++) { 5152 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5153 alu.op = ALU_OP1_EXP_IEEE; 5154 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5155 5156 alu.dst.sel = ctx->temp_reg; 5157 alu.dst.chan = i; 5158 if (i == 2) { 5159 alu.dst.write = 1; 5160 alu.last = 1; 5161 } 5162 5163 r = r600_bytecode_add_alu(ctx->bc, &alu); 5164 if (r) 5165 return r; 5166 } 5167 } else { 5168 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5169 alu.op = ALU_OP1_EXP_IEEE; 5170 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5171 5172 alu.dst.sel = ctx->temp_reg; 5173 alu.dst.write = 1; 5174 alu.dst.chan = 2; 5175 5176 alu.last = 1; 5177 5178 r = r600_bytecode_add_alu(ctx->bc, &alu); 5179 if (r) 5180 return r; 5181 } 5182 } 5183 5184 /* result.w = 1.0;*/ 5185 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) { 5186 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5187 5188 alu.op = ALU_OP1_MOV; 5189 alu.src[0].sel = V_SQ_ALU_SRC_1; 5190 alu.src[0].chan = 0; 5191 5192 alu.dst.sel = ctx->temp_reg; 5193 alu.dst.chan = 3; 5194 alu.dst.write = 1; 5195 alu.last = 1; 5196 r = r600_bytecode_add_alu(ctx->bc, &alu); 5197 if (r) 5198 return r; 5199 } 5200 return tgsi_helper_copy(ctx, inst); 5201} 5202 5203static int tgsi_log(struct r600_shader_ctx *ctx) 5204{ 5205 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5206 struct r600_bytecode_alu alu; 5207 int r; 5208 int i; 5209 5210 /* result.x = floor(log2(|src|)); */ 5211 if (inst->Dst[0].Register.WriteMask & 1) { 5212 if (ctx->bc->chip_class == CAYMAN) { 5213 for (i = 0; i < 3; i++) { 5214 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5215 5216 alu.op = ALU_OP1_LOG_IEEE; 5217 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5218 
r600_bytecode_src_set_abs(&alu.src[0]); 5219 5220 alu.dst.sel = ctx->temp_reg; 5221 alu.dst.chan = i; 5222 if (i == 0) 5223 alu.dst.write = 1; 5224 if (i == 2) 5225 alu.last = 1; 5226 r = r600_bytecode_add_alu(ctx->bc, &alu); 5227 if (r) 5228 return r; 5229 } 5230 5231 } else { 5232 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5233 5234 alu.op = ALU_OP1_LOG_IEEE; 5235 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5236 r600_bytecode_src_set_abs(&alu.src[0]); 5237 5238 alu.dst.sel = ctx->temp_reg; 5239 alu.dst.chan = 0; 5240 alu.dst.write = 1; 5241 alu.last = 1; 5242 r = r600_bytecode_add_alu(ctx->bc, &alu); 5243 if (r) 5244 return r; 5245 } 5246 5247 alu.op = ALU_OP1_FLOOR; 5248 alu.src[0].sel = ctx->temp_reg; 5249 alu.src[0].chan = 0; 5250 5251 alu.dst.sel = ctx->temp_reg; 5252 alu.dst.chan = 0; 5253 alu.dst.write = 1; 5254 alu.last = 1; 5255 5256 r = r600_bytecode_add_alu(ctx->bc, &alu); 5257 if (r) 5258 return r; 5259 } 5260 5261 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */ 5262 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) { 5263 5264 if (ctx->bc->chip_class == CAYMAN) { 5265 for (i = 0; i < 3; i++) { 5266 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5267 5268 alu.op = ALU_OP1_LOG_IEEE; 5269 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5270 r600_bytecode_src_set_abs(&alu.src[0]); 5271 5272 alu.dst.sel = ctx->temp_reg; 5273 alu.dst.chan = i; 5274 if (i == 1) 5275 alu.dst.write = 1; 5276 if (i == 2) 5277 alu.last = 1; 5278 5279 r = r600_bytecode_add_alu(ctx->bc, &alu); 5280 if (r) 5281 return r; 5282 } 5283 } else { 5284 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5285 5286 alu.op = ALU_OP1_LOG_IEEE; 5287 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5288 r600_bytecode_src_set_abs(&alu.src[0]); 5289 5290 alu.dst.sel = ctx->temp_reg; 5291 alu.dst.chan = 1; 5292 alu.dst.write = 1; 5293 alu.last = 1; 5294 5295 r = r600_bytecode_add_alu(ctx->bc, &alu); 5296 if (r) 5297 return r; 5298 } 5299 5300 memset(&alu, 0, sizeof(struct 
r600_bytecode_alu)); 5301 5302 alu.op = ALU_OP1_FLOOR; 5303 alu.src[0].sel = ctx->temp_reg; 5304 alu.src[0].chan = 1; 5305 5306 alu.dst.sel = ctx->temp_reg; 5307 alu.dst.chan = 1; 5308 alu.dst.write = 1; 5309 alu.last = 1; 5310 5311 r = r600_bytecode_add_alu(ctx->bc, &alu); 5312 if (r) 5313 return r; 5314 5315 if (ctx->bc->chip_class == CAYMAN) { 5316 for (i = 0; i < 3; i++) { 5317 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5318 alu.op = ALU_OP1_EXP_IEEE; 5319 alu.src[0].sel = ctx->temp_reg; 5320 alu.src[0].chan = 1; 5321 5322 alu.dst.sel = ctx->temp_reg; 5323 alu.dst.chan = i; 5324 if (i == 1) 5325 alu.dst.write = 1; 5326 if (i == 2) 5327 alu.last = 1; 5328 5329 r = r600_bytecode_add_alu(ctx->bc, &alu); 5330 if (r) 5331 return r; 5332 } 5333 } else { 5334 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5335 alu.op = ALU_OP1_EXP_IEEE; 5336 alu.src[0].sel = ctx->temp_reg; 5337 alu.src[0].chan = 1; 5338 5339 alu.dst.sel = ctx->temp_reg; 5340 alu.dst.chan = 1; 5341 alu.dst.write = 1; 5342 alu.last = 1; 5343 5344 r = r600_bytecode_add_alu(ctx->bc, &alu); 5345 if (r) 5346 return r; 5347 } 5348 5349 if (ctx->bc->chip_class == CAYMAN) { 5350 for (i = 0; i < 3; i++) { 5351 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5352 alu.op = ALU_OP1_RECIP_IEEE; 5353 alu.src[0].sel = ctx->temp_reg; 5354 alu.src[0].chan = 1; 5355 5356 alu.dst.sel = ctx->temp_reg; 5357 alu.dst.chan = i; 5358 if (i == 1) 5359 alu.dst.write = 1; 5360 if (i == 2) 5361 alu.last = 1; 5362 5363 r = r600_bytecode_add_alu(ctx->bc, &alu); 5364 if (r) 5365 return r; 5366 } 5367 } else { 5368 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5369 alu.op = ALU_OP1_RECIP_IEEE; 5370 alu.src[0].sel = ctx->temp_reg; 5371 alu.src[0].chan = 1; 5372 5373 alu.dst.sel = ctx->temp_reg; 5374 alu.dst.chan = 1; 5375 alu.dst.write = 1; 5376 alu.last = 1; 5377 5378 r = r600_bytecode_add_alu(ctx->bc, &alu); 5379 if (r) 5380 return r; 5381 } 5382 5383 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5384 5385 
alu.op = ALU_OP2_MUL; 5386 5387 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5388 r600_bytecode_src_set_abs(&alu.src[0]); 5389 5390 alu.src[1].sel = ctx->temp_reg; 5391 alu.src[1].chan = 1; 5392 5393 alu.dst.sel = ctx->temp_reg; 5394 alu.dst.chan = 1; 5395 alu.dst.write = 1; 5396 alu.last = 1; 5397 5398 r = r600_bytecode_add_alu(ctx->bc, &alu); 5399 if (r) 5400 return r; 5401 } 5402 5403 /* result.z = log2(|src|);*/ 5404 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) { 5405 if (ctx->bc->chip_class == CAYMAN) { 5406 for (i = 0; i < 3; i++) { 5407 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5408 5409 alu.op = ALU_OP1_LOG_IEEE; 5410 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5411 r600_bytecode_src_set_abs(&alu.src[0]); 5412 5413 alu.dst.sel = ctx->temp_reg; 5414 if (i == 2) 5415 alu.dst.write = 1; 5416 alu.dst.chan = i; 5417 if (i == 2) 5418 alu.last = 1; 5419 5420 r = r600_bytecode_add_alu(ctx->bc, &alu); 5421 if (r) 5422 return r; 5423 } 5424 } else { 5425 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5426 5427 alu.op = ALU_OP1_LOG_IEEE; 5428 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5429 r600_bytecode_src_set_abs(&alu.src[0]); 5430 5431 alu.dst.sel = ctx->temp_reg; 5432 alu.dst.write = 1; 5433 alu.dst.chan = 2; 5434 alu.last = 1; 5435 5436 r = r600_bytecode_add_alu(ctx->bc, &alu); 5437 if (r) 5438 return r; 5439 } 5440 } 5441 5442 /* result.w = 1.0; */ 5443 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) { 5444 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5445 5446 alu.op = ALU_OP1_MOV; 5447 alu.src[0].sel = V_SQ_ALU_SRC_1; 5448 alu.src[0].chan = 0; 5449 5450 alu.dst.sel = ctx->temp_reg; 5451 alu.dst.chan = 3; 5452 alu.dst.write = 1; 5453 alu.last = 1; 5454 5455 r = r600_bytecode_add_alu(ctx->bc, &alu); 5456 if (r) 5457 return r; 5458 } 5459 5460 return tgsi_helper_copy(ctx, inst); 5461} 5462 5463static int tgsi_eg_arl(struct r600_shader_ctx *ctx) 5464{ 5465 struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction; 5466 struct r600_bytecode_alu alu; 5467 int r; 5468 5469 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5470 5471 switch (inst->Instruction.Opcode) { 5472 case TGSI_OPCODE_ARL: 5473 alu.op = ALU_OP1_FLT_TO_INT_FLOOR; 5474 break; 5475 case TGSI_OPCODE_ARR: 5476 alu.op = ALU_OP1_FLT_TO_INT; 5477 break; 5478 case TGSI_OPCODE_UARL: 5479 alu.op = ALU_OP1_MOV; 5480 break; 5481 default: 5482 assert(0); 5483 return -1; 5484 } 5485 5486 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5487 alu.last = 1; 5488 alu.dst.sel = ctx->bc->ar_reg; 5489 alu.dst.write = 1; 5490 r = r600_bytecode_add_alu(ctx->bc, &alu); 5491 if (r) 5492 return r; 5493 5494 ctx->bc->ar_loaded = 0; 5495 return 0; 5496} 5497static int tgsi_r600_arl(struct r600_shader_ctx *ctx) 5498{ 5499 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5500 struct r600_bytecode_alu alu; 5501 int r; 5502 5503 switch (inst->Instruction.Opcode) { 5504 case TGSI_OPCODE_ARL: 5505 memset(&alu, 0, sizeof(alu)); 5506 alu.op = ALU_OP1_FLOOR; 5507 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5508 alu.dst.sel = ctx->bc->ar_reg; 5509 alu.dst.write = 1; 5510 alu.last = 1; 5511 5512 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5513 return r; 5514 5515 memset(&alu, 0, sizeof(alu)); 5516 alu.op = ALU_OP1_FLT_TO_INT; 5517 alu.src[0].sel = ctx->bc->ar_reg; 5518 alu.dst.sel = ctx->bc->ar_reg; 5519 alu.dst.write = 1; 5520 alu.last = 1; 5521 5522 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5523 return r; 5524 break; 5525 case TGSI_OPCODE_ARR: 5526 memset(&alu, 0, sizeof(alu)); 5527 alu.op = ALU_OP1_FLT_TO_INT; 5528 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5529 alu.dst.sel = ctx->bc->ar_reg; 5530 alu.dst.write = 1; 5531 alu.last = 1; 5532 5533 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5534 return r; 5535 break; 5536 case TGSI_OPCODE_UARL: 5537 memset(&alu, 0, sizeof(alu)); 5538 alu.op = ALU_OP1_MOV; 5539 r600_bytecode_src(&alu.src[0], &ctx->src[0], 
0); 5540 alu.dst.sel = ctx->bc->ar_reg; 5541 alu.dst.write = 1; 5542 alu.last = 1; 5543 5544 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5545 return r; 5546 break; 5547 default: 5548 assert(0); 5549 return -1; 5550 } 5551 5552 ctx->bc->ar_loaded = 0; 5553 return 0; 5554} 5555 5556static int tgsi_opdst(struct r600_shader_ctx *ctx) 5557{ 5558 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5559 struct r600_bytecode_alu alu; 5560 int i, r = 0; 5561 5562 for (i = 0; i < 4; i++) { 5563 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5564 5565 alu.op = ALU_OP2_MUL; 5566 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5567 5568 if (i == 0 || i == 3) { 5569 alu.src[0].sel = V_SQ_ALU_SRC_1; 5570 } else { 5571 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 5572 } 5573 5574 if (i == 0 || i == 2) { 5575 alu.src[1].sel = V_SQ_ALU_SRC_1; 5576 } else { 5577 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5578 } 5579 if (i == 3) 5580 alu.last = 1; 5581 r = r600_bytecode_add_alu(ctx->bc, &alu); 5582 if (r) 5583 return r; 5584 } 5585 return 0; 5586} 5587 5588static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type) 5589{ 5590 struct r600_bytecode_alu alu; 5591 int r; 5592 5593 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5594 alu.op = opcode; 5595 alu.execute_mask = 1; 5596 alu.update_pred = 1; 5597 5598 alu.dst.sel = ctx->temp_reg; 5599 alu.dst.write = 1; 5600 alu.dst.chan = 0; 5601 5602 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5603 alu.src[1].sel = V_SQ_ALU_SRC_0; 5604 alu.src[1].chan = 0; 5605 5606 alu.last = 1; 5607 5608 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type); 5609 if (r) 5610 return r; 5611 return 0; 5612} 5613 5614static int pops(struct r600_shader_ctx *ctx, int pops) 5615{ 5616 unsigned force_pop = ctx->bc->force_add_cf; 5617 5618 if (!force_pop) { 5619 int alu_pop = 3; 5620 if (ctx->bc->cf_last) { 5621 if (ctx->bc->cf_last->op == CF_OP_ALU) 5622 alu_pop = 0; 5623 else if 
(ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER) 5624 alu_pop = 1; 5625 } 5626 alu_pop += pops; 5627 if (alu_pop == 1) { 5628 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER; 5629 ctx->bc->force_add_cf = 1; 5630 } else if (alu_pop == 2) { 5631 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER; 5632 ctx->bc->force_add_cf = 1; 5633 } else { 5634 force_pop = 1; 5635 } 5636 } 5637 5638 if (force_pop) { 5639 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP); 5640 ctx->bc->cf_last->pop_count = pops; 5641 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2; 5642 } 5643 5644 return 0; 5645} 5646 5647static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx, 5648 unsigned reason) 5649{ 5650 struct r600_stack_info *stack = &ctx->bc->stack; 5651 unsigned elements, entries; 5652 5653 unsigned entry_size = stack->entry_size; 5654 5655 elements = (stack->loop + stack->push_wqm ) * entry_size; 5656 elements += stack->push; 5657 5658 switch (ctx->bc->chip_class) { 5659 case R600: 5660 case R700: 5661 /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on 5662 * the stack must be reserved to hold the current active/continue 5663 * masks */ 5664 if (reason == FC_PUSH_VPM) { 5665 elements += 2; 5666 } 5667 break; 5668 5669 case CAYMAN: 5670 /* r9xx: any stack operation on empty stack consumes 2 additional 5671 * elements */ 5672 elements += 2; 5673 5674 /* fallthrough */ 5675 /* FIXME: do the two elements added above cover the cases for the 5676 * r8xx+ below? */ 5677 5678 case EVERGREEN: 5679 /* r8xx+: 2 extra elements are not always required, but one extra 5680 * element must be added for each of the following cases: 5681 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest 5682 * stack usage. 5683 * (Currently we don't use ALU_ELSE_AFTER.) 5684 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM 5685 * PUSH instruction executed. 5686 * 5687 * NOTE: it seems we also need to reserve additional element in some 5688 * other cases, e.g. 
when we have 4 levels of PUSH_VPM in the shader, 5689 * then STACK_SIZE should be 2 instead of 1 */ 5690 if (reason == FC_PUSH_VPM) { 5691 elements += 1; 5692 } 5693 break; 5694 5695 default: 5696 assert(0); 5697 break; 5698 } 5699 5700 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4 5701 * for all chips, so we use 4 in the final formula, not the real entry_size 5702 * for the chip */ 5703 entry_size = 4; 5704 5705 entries = (elements + (entry_size - 1)) / entry_size; 5706 5707 if (entries > stack->max_entries) 5708 stack->max_entries = entries; 5709} 5710 5711static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason) 5712{ 5713 switch(reason) { 5714 case FC_PUSH_VPM: 5715 --ctx->bc->stack.push; 5716 assert(ctx->bc->stack.push >= 0); 5717 break; 5718 case FC_PUSH_WQM: 5719 --ctx->bc->stack.push_wqm; 5720 assert(ctx->bc->stack.push_wqm >= 0); 5721 break; 5722 case FC_LOOP: 5723 --ctx->bc->stack.loop; 5724 assert(ctx->bc->stack.loop >= 0); 5725 break; 5726 default: 5727 assert(0); 5728 break; 5729 } 5730} 5731 5732static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason) 5733{ 5734 switch (reason) { 5735 case FC_PUSH_VPM: 5736 ++ctx->bc->stack.push; 5737 break; 5738 case FC_PUSH_WQM: 5739 ++ctx->bc->stack.push_wqm; 5740 case FC_LOOP: 5741 ++ctx->bc->stack.loop; 5742 break; 5743 default: 5744 assert(0); 5745 } 5746 5747 callstack_update_max_depth(ctx, reason); 5748} 5749 5750static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp) 5751{ 5752 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp]; 5753 5754 sp->mid = realloc((void *)sp->mid, 5755 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1)); 5756 sp->mid[sp->num_mid] = ctx->bc->cf_last; 5757 sp->num_mid++; 5758} 5759 5760static void fc_pushlevel(struct r600_shader_ctx *ctx, int type) 5761{ 5762 ctx->bc->fc_sp++; 5763 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type; 5764 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last; 5765} 
5766 5767static void fc_poplevel(struct r600_shader_ctx *ctx) 5768{ 5769 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp]; 5770 free(sp->mid); 5771 sp->mid = NULL; 5772 sp->num_mid = 0; 5773 sp->start = NULL; 5774 sp->type = 0; 5775 ctx->bc->fc_sp--; 5776} 5777 5778#if 0 5779static int emit_return(struct r600_shader_ctx *ctx) 5780{ 5781 r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN)); 5782 return 0; 5783} 5784 5785static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset) 5786{ 5787 5788 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP)); 5789 ctx->bc->cf_last->pop_count = pops; 5790 /* XXX work out offset */ 5791 return 0; 5792} 5793 5794static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value) 5795{ 5796 return 0; 5797} 5798 5799static void emit_testflag(struct r600_shader_ctx *ctx) 5800{ 5801 5802} 5803 5804static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx) 5805{ 5806 emit_testflag(ctx); 5807 emit_jump_to_offset(ctx, 1, 4); 5808 emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0); 5809 pops(ctx, ifidx + 1); 5810 emit_return(ctx); 5811} 5812 5813static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp) 5814{ 5815 emit_testflag(ctx); 5816 5817 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op); 5818 ctx->bc->cf_last->pop_count = 1; 5819 5820 fc_set_mid(ctx, fc_sp); 5821 5822 pops(ctx, 1); 5823} 5824#endif 5825 5826static int emit_if(struct r600_shader_ctx *ctx, int opcode) 5827{ 5828 int alu_type = CF_OP_ALU_PUSH_BEFORE; 5829 5830 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by 5831 * LOOP_STARTxxx for nested loops may put the branch stack into a state 5832 * such that ALU_PUSH_BEFORE doesn't work as expected. 
Workaround this 5833 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */ 5834 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) { 5835 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH); 5836 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2; 5837 alu_type = CF_OP_ALU; 5838 } 5839 5840 emit_logic_pred(ctx, opcode, alu_type); 5841 5842 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP); 5843 5844 fc_pushlevel(ctx, FC_IF); 5845 5846 callstack_push(ctx, FC_PUSH_VPM); 5847 return 0; 5848} 5849 5850static int tgsi_if(struct r600_shader_ctx *ctx) 5851{ 5852 return emit_if(ctx, ALU_OP2_PRED_SETNE); 5853} 5854 5855static int tgsi_uif(struct r600_shader_ctx *ctx) 5856{ 5857 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT); 5858} 5859 5860static int tgsi_else(struct r600_shader_ctx *ctx) 5861{ 5862 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE); 5863 ctx->bc->cf_last->pop_count = 1; 5864 5865 fc_set_mid(ctx, ctx->bc->fc_sp); 5866 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id; 5867 return 0; 5868} 5869 5870static int tgsi_endif(struct r600_shader_ctx *ctx) 5871{ 5872 pops(ctx, 1); 5873 if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) { 5874 R600_ERR("if/endif unbalanced in shader\n"); 5875 return -1; 5876 } 5877 5878 if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) { 5879 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2; 5880 ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1; 5881 } else { 5882 ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2; 5883 } 5884 fc_poplevel(ctx); 5885 5886 callstack_pop(ctx, FC_PUSH_VPM); 5887 return 0; 5888} 5889 5890static int tgsi_bgnloop(struct r600_shader_ctx *ctx) 5891{ 5892 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not 5893 * limited to 4096 iterations, like the other LOOP_* instructions. 
*/ 5894 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10); 5895 5896 fc_pushlevel(ctx, FC_LOOP); 5897 5898 /* check stack depth */ 5899 callstack_push(ctx, FC_LOOP); 5900 return 0; 5901} 5902 5903static int tgsi_endloop(struct r600_shader_ctx *ctx) 5904{ 5905 int i; 5906 5907 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END); 5908 5909 if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) { 5910 R600_ERR("loop/endloop in shader code are not paired.\n"); 5911 return -EINVAL; 5912 } 5913 5914 /* fixup loop pointers - from r600isa 5915 LOOP END points to CF after LOOP START, 5916 LOOP START point to CF after LOOP END 5917 BRK/CONT point to LOOP END CF 5918 */ 5919 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2; 5920 5921 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2; 5922 5923 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) { 5924 ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id; 5925 } 5926 /* XXX add LOOPRET support */ 5927 fc_poplevel(ctx); 5928 callstack_pop(ctx, FC_LOOP); 5929 return 0; 5930} 5931 5932static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx) 5933{ 5934 unsigned int fscp; 5935 5936 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--) 5937 { 5938 if (FC_LOOP == ctx->bc->fc_stack[fscp].type) 5939 break; 5940 } 5941 5942 if (fscp == 0) { 5943 R600_ERR("Break not inside loop/endloop pair\n"); 5944 return -EINVAL; 5945 } 5946 5947 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op); 5948 5949 fc_set_mid(ctx, fscp); 5950 5951 return 0; 5952} 5953 5954static int tgsi_gs_emit(struct r600_shader_ctx *ctx) 5955{ 5956 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX) 5957 emit_gs_ring_writes(ctx); 5958 5959 return r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op); 5960} 5961 5962static int tgsi_umad(struct r600_shader_ctx *ctx) 5963{ 5964 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5965 struct r600_bytecode_alu alu; 5966 int 
i, j, k, r; 5967 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); 5968 5969 /* src0 * src1 */ 5970 for (i = 0; i < lasti + 1; i++) { 5971 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 5972 continue; 5973 5974 if (ctx->bc->chip_class == CAYMAN) { 5975 for (j = 0 ; j < 4; j++) { 5976 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5977 5978 alu.op = ALU_OP2_MULLO_UINT; 5979 for (k = 0; k < inst->Instruction.NumSrcRegs; k++) { 5980 r600_bytecode_src(&alu.src[k], &ctx->src[k], i); 5981 } 5982 tgsi_dst(ctx, &inst->Dst[0], j, &alu.dst); 5983 alu.dst.sel = ctx->temp_reg; 5984 alu.dst.write = (j == i); 5985 if (j == 3) 5986 alu.last = 1; 5987 r = r600_bytecode_add_alu(ctx->bc, &alu); 5988 if (r) 5989 return r; 5990 } 5991 } else { 5992 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5993 5994 alu.dst.chan = i; 5995 alu.dst.sel = ctx->temp_reg; 5996 alu.dst.write = 1; 5997 5998 alu.op = ALU_OP2_MULLO_UINT; 5999 for (j = 0; j < 2; j++) { 6000 r600_bytecode_src(&alu.src[j], &ctx->src[j], i); 6001 } 6002 6003 alu.last = 1; 6004 r = r600_bytecode_add_alu(ctx->bc, &alu); 6005 if (r) 6006 return r; 6007 } 6008 } 6009 6010 6011 for (i = 0; i < lasti + 1; i++) { 6012 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) 6013 continue; 6014 6015 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6016 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6017 6018 alu.op = ALU_OP2_ADD_INT; 6019 6020 alu.src[0].sel = ctx->temp_reg; 6021 alu.src[0].chan = i; 6022 6023 r600_bytecode_src(&alu.src[1], &ctx->src[2], i); 6024 if (i == lasti) { 6025 alu.last = 1; 6026 } 6027 r = r600_bytecode_add_alu(ctx->bc, &alu); 6028 if (r) 6029 return r; 6030 } 6031 return 0; 6032} 6033 6034static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = { 6035 {TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_r600_arl}, 6036 {TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2}, 6037 {TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit}, 6038 6039 /* XXX: 6040 * For state trackers other than OpenGL, we'll 
want to use 6041 * _RECIP_IEEE instead. 6042 */ 6043 {TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate}, 6044 6045 {TGSI_OPCODE_RSQ, 0, ALU_OP0_NOP, tgsi_rsq}, 6046 {TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp}, 6047 {TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log}, 6048 {TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2}, 6049 {TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2}, 6050 {TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp}, 6051 {TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp}, 6052 {TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst}, 6053 {TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2}, 6054 {TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2}, 6055 {TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap}, 6056 {TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2}, 6057 {TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3}, 6058 {TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2}, 6059 {TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp}, 6060 {TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported}, 6061 /* gap */ 6062 {20, 0, ALU_OP0_NOP, tgsi_unsupported}, 6063 {TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported}, 6064 /* gap */ 6065 {22, 0, ALU_OP0_NOP, tgsi_unsupported}, 6066 {23, 0, ALU_OP0_NOP, tgsi_unsupported}, 6067 {TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2}, 6068 {TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported}, 6069 {TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2}, 6070 {TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2}, 6071 {TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate}, 6072 {TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate}, 6073 {TGSI_OPCODE_POW, 0, ALU_OP0_NOP, tgsi_pow}, 6074 {TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd}, 6075 /* gap */ 6076 {32, 0, ALU_OP0_NOP, tgsi_unsupported}, 6077 {TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2}, 6078 {TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported}, 6079 {TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp}, 6080 {TGSI_OPCODE_COS, 0, ALU_OP1_COS, tgsi_trig}, 6081 {TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 6082 {TGSI_OPCODE_DDY, 0, 
FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 6083 {TGSI_OPCODE_KILL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */ 6084 {TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported}, 6085 {TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported}, 6086 {TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported}, 6087 {TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported}, 6088 {TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported}, 6089 {TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2}, 6090 {TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported}, 6091 {TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2}, 6092 {TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, tgsi_trig}, 6093 {TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap}, 6094 {TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2}, 6095 {TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported}, 6096 {TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex}, 6097 {TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex}, 6098 {TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex}, 6099 {TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported}, 6100 {TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported}, 6101 {TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported}, 6102 {TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported}, 6103 {TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported}, 6104 {TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported}, 6105 {TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_r600_arl}, 6106 {TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported}, 6107 {TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported}, 6108 {TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported}, 6109 {TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg}, 6110 {TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp}, 6111 {TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs}, 6112 {TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex}, 6113 {TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported}, 6114 {TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported}, 6115 {TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp}, 6116 {TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex}, 6117 {TGSI_OPCODE_BRK, 0, 
CF_OP_LOOP_BREAK, tgsi_loop_brk_cont}, 6118 {TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if}, 6119 {TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif}, 6120 {76, 0, ALU_OP0_NOP, tgsi_unsupported}, 6121 {TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else}, 6122 {TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif}, 6123 /* gap */ 6124 {79, 0, ALU_OP0_NOP, tgsi_unsupported}, 6125 {80, 0, ALU_OP0_NOP, tgsi_unsupported}, 6126 {TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported}, 6127 {TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported}, 6128 {TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2}, 6129 {TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2_trans}, 6130 {TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2}, 6131 {TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2}, 6132 {TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2_trans}, 6133 /* gap */ 6134 {88, 0, ALU_OP0_NOP, tgsi_unsupported}, 6135 {TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2}, 6136 {TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2}, 6137 {TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod}, 6138 {TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2}, 6139 {TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported}, 6140 {TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex}, 6141 {TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 6142 {TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont}, 6143 {TGSI_OPCODE_EMIT, 0, ALU_OP0_NOP, tgsi_unsupported}, 6144 {TGSI_OPCODE_ENDPRIM, 0, ALU_OP0_NOP, tgsi_unsupported}, 6145 {TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop}, 6146 {TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported}, 6147 {TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop}, 6148 {TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported}, 6149 {TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 6150 /* gap */ 6151 {104, 0, ALU_OP0_NOP, tgsi_unsupported}, 6152 {105, 0, ALU_OP0_NOP, tgsi_unsupported}, 6153 {106, 0, ALU_OP0_NOP, tgsi_unsupported}, 6154 {TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported}, 6155 {TGSI_OPCODE_FSEQ, 0, ALU_OP2_SETE_DX10, 
tgsi_op2}, 6156 {TGSI_OPCODE_FSGE, 0, ALU_OP2_SETGE_DX10, tgsi_op2}, 6157 {TGSI_OPCODE_FSLT, 0, ALU_OP2_SETGT_DX10, tgsi_op2_swap}, 6158 {TGSI_OPCODE_FSNE, 0, ALU_OP2_SETNE_DX10, tgsi_op2_swap}, 6159 {TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported}, 6160 {TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported}, 6161 /* gap */ 6162 {114, 0, ALU_OP0_NOP, tgsi_unsupported}, 6163 {TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported}, 6164 {TGSI_OPCODE_KILL_IF, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ 6165 {TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */ 6166 /* gap */ 6167 {118, 0, ALU_OP0_NOP, tgsi_unsupported}, 6168 {TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_op2_trans}, 6169 {TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv}, 6170 {TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2}, 6171 {TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2}, 6172 {TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg}, 6173 {TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2}, 6174 {TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2_trans}, 6175 {TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap}, 6176 {TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_op2_trans}, 6177 {TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2_trans}, 6178 {TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2}, 6179 {TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv}, 6180 {TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad}, 6181 {TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2}, 6182 {TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2}, 6183 {TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod}, 6184 {TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_UINT, tgsi_op2_trans}, 6185 {TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2}, 6186 {TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2}, 6187 {TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2_trans}, 6188 {TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap}, 6189 {TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2_swap}, 6190 {TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported}, 6191 
{TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6192 {TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported}, 6193 {TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported}, 6194 {TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported}, 6195 {TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported}, 6196 {TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported}, 6197 {TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported}, 6198 {TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported}, 6199 {TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported}, 6200 {TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported}, 6201 {TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported}, 6202 {TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported}, 6203 {TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported}, 6204 {TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported}, 6205 {TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported}, 6206 {TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_r600_arl}, 6207 {TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp}, 6208 {TGSI_OPCODE_IABS, 0, 0, tgsi_iabs}, 6209 {TGSI_OPCODE_ISSG, 0, 0, tgsi_issg}, 6210 {TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported}, 6211 {TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6212 {TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6213 {TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6214 {TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6215 {TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported}, 6216 {TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported}, 6217 {TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported}, 6218 {TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported}, 6219 {TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported}, 6220 {TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported}, 6221 {TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported}, 6222 {TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported}, 6223 {TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported}, 6224 {TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported}, 6225 {TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, 
tgsi_unsupported}, 6226 {TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex}, 6227 {TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex}, 6228 {TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex}, 6229 {TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported}, 6230}; 6231 6232static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = { 6233 {TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_eg_arl}, 6234 {TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2}, 6235 {TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit}, 6236 {TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate}, 6237 {TGSI_OPCODE_RSQ, 0, ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq}, 6238 {TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp}, 6239 {TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log}, 6240 {TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2}, 6241 {TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2}, 6242 {TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp}, 6243 {TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp}, 6244 {TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst}, 6245 {TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2}, 6246 {TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2}, 6247 {TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap}, 6248 {TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2}, 6249 {TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3}, 6250 {TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2}, 6251 {TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp}, 6252 {TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported}, 6253 /* gap */ 6254 {20, 0, ALU_OP0_NOP, tgsi_unsupported}, 6255 {TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported}, 6256 /* gap */ 6257 {22, 0, ALU_OP0_NOP, tgsi_unsupported}, 6258 {23, 0, ALU_OP0_NOP, tgsi_unsupported}, 6259 {TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2}, 6260 {TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported}, 6261 {TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2}, 6262 {TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2}, 6263 {TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate}, 6264 {TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate}, 6265 
{TGSI_OPCODE_POW, 0, ALU_OP0_NOP, tgsi_pow}, 6266 {TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd}, 6267 /* gap */ 6268 {32, 0, ALU_OP0_NOP, tgsi_unsupported}, 6269 {TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2}, 6270 {TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported}, 6271 {TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp}, 6272 {TGSI_OPCODE_COS, 0, ALU_OP1_COS, tgsi_trig}, 6273 {TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 6274 {TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 6275 {TGSI_OPCODE_KILL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */ 6276 {TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported}, 6277 {TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported}, 6278 {TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported}, 6279 {TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported}, 6280 {TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported}, 6281 {TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2}, 6282 {TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported}, 6283 {TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2}, 6284 {TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, tgsi_trig}, 6285 {TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap}, 6286 {TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2}, 6287 {TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported}, 6288 {TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex}, 6289 {TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex}, 6290 {TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex}, 6291 {TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported}, 6292 {TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported}, 6293 {TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported}, 6294 {TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported}, 6295 {TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported}, 6296 {TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported}, 6297 {TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_eg_arl}, 6298 {TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported}, 6299 {TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported}, 6300 {TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported}, 6301 
{TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg}, 6302 {TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp}, 6303 {TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs}, 6304 {TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex}, 6305 {TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported}, 6306 {TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported}, 6307 {TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp}, 6308 {TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex}, 6309 {TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont}, 6310 {TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if}, 6311 {TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif}, 6312 {76, 0, ALU_OP0_NOP, tgsi_unsupported}, 6313 {TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else}, 6314 {TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif}, 6315 /* gap */ 6316 {79, 0, ALU_OP0_NOP, tgsi_unsupported}, 6317 {80, 0, ALU_OP0_NOP, tgsi_unsupported}, 6318 {TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported}, 6319 {TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported}, 6320 {TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2}, 6321 {TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2_trans}, 6322 {TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2}, 6323 {TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2}, 6324 {TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2}, 6325 /* gap */ 6326 {88, 0, ALU_OP0_NOP, tgsi_unsupported}, 6327 {TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2}, 6328 {TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2}, 6329 {TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod}, 6330 {TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2}, 6331 {TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported}, 6332 {TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex}, 6333 {TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 6334 {TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont}, 6335 {TGSI_OPCODE_EMIT, 0, CF_OP_EMIT_VERTEX, tgsi_gs_emit}, 6336 {TGSI_OPCODE_ENDPRIM, 0, CF_OP_CUT_VERTEX, tgsi_gs_emit}, 6337 {TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop}, 6338 {TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported}, 
6339 {TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop}, 6340 {TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported}, 6341 {TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 6342 /* gap */ 6343 {104, 0, ALU_OP0_NOP, tgsi_unsupported}, 6344 {105, 0, ALU_OP0_NOP, tgsi_unsupported}, 6345 {106, 0, ALU_OP0_NOP, tgsi_unsupported}, 6346 {TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported}, 6347 {TGSI_OPCODE_FSEQ, 0, ALU_OP2_SETE_DX10, tgsi_op2}, 6348 {TGSI_OPCODE_FSGE, 0, ALU_OP2_SETGE_DX10, tgsi_op2}, 6349 {TGSI_OPCODE_FSLT, 0, ALU_OP2_SETGT_DX10, tgsi_op2_swap}, 6350 {TGSI_OPCODE_FSNE, 0, ALU_OP2_SETNE_DX10, tgsi_op2_swap}, 6351 {TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported}, 6352 {TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported}, 6353 /* gap */ 6354 {114, 0, ALU_OP0_NOP, tgsi_unsupported}, 6355 {TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported}, 6356 {TGSI_OPCODE_KILL_IF, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ 6357 {TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */ 6358 /* gap */ 6359 {118, 0, ALU_OP0_NOP, tgsi_unsupported}, 6360 {TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_f2i}, 6361 {TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv}, 6362 {TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2}, 6363 {TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2}, 6364 {TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg}, 6365 {TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2}, 6366 {TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2}, 6367 {TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap}, 6368 {TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_f2i}, 6369 {TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2_trans}, 6370 {TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2}, 6371 {TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv}, 6372 {TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad}, 6373 {TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2}, 6374 {TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2}, 6375 {TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod}, 6376 
{TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_UINT, tgsi_op2_trans}, 6377 {TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2}, 6378 {TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2}, 6379 {TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2}, 6380 {TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap}, 6381 {TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2}, 6382 {TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported}, 6383 {TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6384 {TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported}, 6385 {TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported}, 6386 {TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported}, 6387 {TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported}, 6388 {TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported}, 6389 {TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported}, 6390 {TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported}, 6391 {TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported}, 6392 {TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported}, 6393 {TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported}, 6394 {TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported}, 6395 {TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported}, 6396 {TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported}, 6397 {TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported}, 6398 {TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_eg_arl}, 6399 {TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp}, 6400 {TGSI_OPCODE_IABS, 0, 0, tgsi_iabs}, 6401 {TGSI_OPCODE_ISSG, 0, 0, tgsi_issg}, 6402 {TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported}, 6403 {TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6404 {TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6405 {TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6406 {TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported}, 6407 {TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported}, 6408 {TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported}, 6409 {TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported}, 6410 {TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported}, 6411 
/* NOTE(review): tail of the preceding per-chip TGSI dispatch table --
	 * its declaration starts above this chunk.  Atomics are unimplemented
	 * here; TEX2/TXB2/TXL2 reuse the plain sample fetch ops, and the
	 * TGSI_OPCODE_LAST entry terminates/caps the table. */
	{TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
	{TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
	{TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported},
};

/* Cayman (cm) TGSI -> hardware instruction dispatch table.
 *
 * The table is positionally indexed by TGSI opcode number: entries whose
 * first field is a bare integer ({20, ...}, {76, ...}, {104, ...}, ...) fill
 * holes in the TGSI opcode numbering so that array index == opcode stays
 * true; the "gap" comments mark those fillers.  Each entry appears to be
 * { tgsi_opcode, op3-flag, hw opcode, emit callback } -- the flag is 1 only
 * for MAD, the sole ALU_OP3_* opcode here (TODO confirm field names against
 * struct r600_shader_tgsi_instruction, declared elsewhere in this file).
 *
 * Opcodes with no implementation map to ALU_OP0_NOP + tgsi_unsupported.
 * Cayman-specific callbacks (cayman_emit_float_instr, cayman_trig,
 * cayman_pow, cayman_mul_int_instr) handle the ops that per the CAYMAN
 * notes at the top of the file became vector ops replicated across all
 * slots on this chip, which is why this table differs from the other
 * chips' tables.
 */
static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
	{TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_eg_arl},
	{TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
	{TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},
	{TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_RSQ, 0, ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
	{TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
	{TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
	{TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
	{TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
	{TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
	{TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
	/* SLT/SLE: hw only has SETGT/SETGE, so operands are swapped */
	{TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
	{TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
	{TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
	/* SUB maps to ADD; presumably tgsi_op2 negates src1 for this
	 * opcode -- verify in tgsi_op2 */
	{TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2},
	{TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
	{TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{20, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{22, 0, ALU_OP0_NOP, tgsi_unsupported},
	{23, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
	{TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
	{TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
	{TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_POW, 0, ALU_OP0_NOP, cayman_pow},
	{TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
	/* gap */
	{32, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* ABS maps to MOV; presumably tgsi_op2 sets the src absolute-value
	 * modifier for this opcode -- verify in tgsi_op2 */
	{TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
	{TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_COS, 0, ALU_OP1_COS, cayman_trig},
	{TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	{TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	{TGSI_OPCODE_KILL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
	{TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
	{TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
	{TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, cayman_trig},
	{TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
	{TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
	{TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
	{TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_eg_arl},
	{TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* subroutines (CAL/RET/BGNSUB/ENDSUB below) are not implemented */
	{TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
	{TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
	{TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
	{TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
	{TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
	{TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	{TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
	{TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
	{76, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
	{TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
	/* gap */
	{79, 0, ALU_OP0_NOP, tgsi_unsupported},
	{80, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
	{TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2},
	{TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
	{TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
	{TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2},
	/* gap */
	{88, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
	{TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
	{TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
	{TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
	{TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
	{TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	{TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	/* geometry-shader stream output ops */
	{TGSI_OPCODE_EMIT, 0, CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	{TGSI_OPCODE_ENDPRIM, 0, CF_OP_CUT_VERTEX, tgsi_gs_emit},
	{TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop},
	{TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop},
	{TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	/* gap */
	{104, 0, ALU_OP0_NOP, tgsi_unsupported},
	{105, 0, ALU_OP0_NOP, tgsi_unsupported},
	{106, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{TGSI_OPCODE_FSEQ, 0, ALU_OP2_SETE_DX10, tgsi_op2},
	{TGSI_OPCODE_FSGE, 0, ALU_OP2_SETGE_DX10, tgsi_op2},
	{TGSI_OPCODE_FSLT, 0, ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	{TGSI_OPCODE_FSNE, 0, ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	{TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{114, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_KILL_IF, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
	{TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */
	/* gap */
	{118, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_op2},
	{TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv},
	{TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2},
	{TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2},
	{TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg},
	{TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2},
	{TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2},
	{TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
	{TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_op2},
	{TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2},
	{TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2},
	{TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv},
	{TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad},
	{TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2},
	{TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2},
	{TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod},
	{TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_INT, cayman_mul_int_instr},
	{TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2},
	{TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2},
	{TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2},
	{TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	{TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2},
	{TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
	/* DX10-style SAMPLE_* ops are not wired up on this chip */
	{TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
	{TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_eg_arl},
	{TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp},
	{TGSI_OPCODE_IABS, 0, 0, tgsi_iabs},
	{TGSI_OPCODE_ISSG, 0, 0, tgsi_issg},
	/* resource load/store, fences, barriers and atomics: unimplemented */
	{TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
	{TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
	/* table terminator / cap */
	{TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported},
};