evergreen_compute.c revision aeb2be3a2f1839b91532b178b997b20ddb69eb13
1/* 2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 * 23 * Authors: 24 * Adam Rak <adam.rak@streamnovation.com> 25 */ 26 27#include <stdio.h> 28#include <errno.h> 29#include "pipe/p_defines.h" 30#include "pipe/p_state.h" 31#include "pipe/p_context.h" 32#include "util/u_blitter.h" 33#include "util/list.h" 34#include "util/u_transfer.h" 35#include "util/u_surface.h" 36#include "util/u_pack_color.h" 37#include "util/u_memory.h" 38#include "util/u_inlines.h" 39#include "util/u_framebuffer.h" 40#include "pipebuffer/pb_buffer.h" 41#include "evergreend.h" 42#include "r600_shader.h" 43#include "r600_pipe.h" 44#include "r600_formats.h" 45#include "evergreen_compute.h" 46#include "evergreen_compute_internal.h" 47#include "compute_memory_pool.h" 48#include "sb/sb_public.h" 49#ifdef HAVE_OPENCL 50#include "radeon/radeon_llvm_util.h" 51#endif 52#include "radeon/radeon_elf_util.h" 53#include <inttypes.h> 54 55/** 56RAT0 is for global binding write 57VTX1 is for global binding read 58 59for wrting images RAT1... 60for reading images TEX2... 61 TEX2-RAT1 is paired 62 63TEX2... consumes the same fetch resources, that VTX2... would consume 64 65CONST0 and VTX0 is for parameters 66 CONST0 is binding smaller input parameter buffer, and for constant indexing, 67 also constant cached 68 VTX0 is for indirect/non-constant indexing, or if the input is bigger than 69 the constant cache can handle 70 71RAT-s are limited to 12, so we can only bind at most 11 texture for writing 72because we reserve RAT0 for global bindings. With byteaddressing enabled, 73we should reserve another one too.=> 10 image binding for writing max. 74 75from Nvidia OpenCL: 76 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128 77 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8 78 79so 10 for writing is enough. 176 is the max for reading according to the docs 80 81writable images should be listed first < 10, so their id corresponds to RAT(id+1) 82writable images will consume TEX slots, VTX slots too because of linear indexing 83 84*/ 85 86struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen, 87 unsigned size) 88{ 89 struct pipe_resource *buffer = NULL; 90 assert(size); 91 92 buffer = pipe_buffer_create((struct pipe_screen*) screen, 93 PIPE_BIND_CUSTOM, 94 PIPE_USAGE_IMMUTABLE, 95 size); 96 97 return (struct r600_resource *)buffer; 98} 99 100 101static void evergreen_set_rat(struct r600_pipe_compute *pipe, 102 unsigned id, 103 struct r600_resource *bo, 104 int start, 105 int size) 106{ 107 struct pipe_surface rat_templ; 108 struct r600_surface *surf = NULL; 109 struct r600_context *rctx = NULL; 110 111 assert(id < 12); 112 assert((size & 3) == 0); 113 assert((start & 0xFF) == 0); 114 115 rctx = pipe->ctx; 116 117 COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id); 118 119 /* Create the RAT surface */ 120 memset(&rat_templ, 0, sizeof(rat_templ)); 121 rat_templ.format = PIPE_FORMAT_R32_UINT; 122 rat_templ.u.tex.level = 0; 123 rat_templ.u.tex.first_layer = 0; 124 rat_templ.u.tex.last_layer = 0; 125 126 /* Add the RAT the list of color buffers */ 127 pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface( 128 (struct pipe_context *)pipe->ctx, 129 (struct pipe_resource *)bo, &rat_templ); 130 131 /* Update the number of color buffers */ 132 pipe->ctx->framebuffer.state.nr_cbufs = 133 MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs); 134 135 /* Update the cb_target_mask 136 * XXX: I think this is a potential spot for bugs once we start doing 137 * GL interop. cb_target_mask may be modified in the 3D sections 138 * of this driver. */ 139 pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4)); 140 141 surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id]; 142 evergreen_init_color_surface_rat(rctx, surf); 143} 144 145static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx, 146 unsigned vb_index, 147 unsigned offset, 148 struct pipe_resource *buffer) 149{ 150 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state; 151 struct pipe_vertex_buffer *vb = &state->vb[vb_index]; 152 vb->stride = 1; 153 vb->buffer_offset = offset; 154 vb->buffer = buffer; 155 vb->user_buffer = NULL; 156 157 /* The vertex instructions in the compute shaders use the texture cache, 158 * so we need to invalidate it. */ 159 rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE; 160 state->enabled_mask |= 1 << vb_index; 161 state->dirty_mask |= 1 << vb_index; 162 r600_mark_atom_dirty(rctx, &state->atom); 163} 164 165static void evergreen_cs_set_constant_buffer(struct r600_context *rctx, 166 unsigned cb_index, 167 unsigned offset, 168 unsigned size, 169 struct pipe_resource *buffer) 170{ 171 struct pipe_constant_buffer cb; 172 cb.buffer_size = size; 173 cb.buffer_offset = offset; 174 cb.buffer = buffer; 175 cb.user_buffer = NULL; 176 177 rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb); 178} 179 180static const struct u_resource_vtbl r600_global_buffer_vtbl = 181{ 182 u_default_resource_get_handle, /* get_handle */ 183 r600_compute_global_buffer_destroy, /* resource_destroy */ 184 r600_compute_global_transfer_map, /* transfer_map */ 185 r600_compute_global_transfer_flush_region,/* transfer_flush_region */ 186 r600_compute_global_transfer_unmap, /* transfer_unmap */ 187 r600_compute_global_transfer_inline_write /* transfer_inline_write */ 188}; 189 190/* We need to define these R600 registers here, because we can't include 191 * evergreend.h and r600d.h. 192 */ 193#define R_028868_SQ_PGM_RESOURCES_VS 0x028868 194#define R_028850_SQ_PGM_RESOURCES_PS 0x028850 195 196#ifdef HAVE_OPENCL 197 198static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary, 199 struct r600_bytecode *bc, 200 uint64_t symbol_offset, 201 boolean *use_kill) 202{ 203 unsigned i; 204 const unsigned char *config = 205 radeon_shader_binary_config_start(binary, symbol_offset); 206 207 for (i = 0; i < binary->config_size_per_symbol; i+= 8) { 208 unsigned reg = 209 util_le32_to_cpu(*(uint32_t*)(config + i)); 210 unsigned value = 211 util_le32_to_cpu(*(uint32_t*)(config + i + 4)); 212 switch (reg) { 213 /* R600 / R700 */ 214 case R_028850_SQ_PGM_RESOURCES_PS: 215 case R_028868_SQ_PGM_RESOURCES_VS: 216 /* Evergreen / Northern Islands */ 217 case R_028844_SQ_PGM_RESOURCES_PS: 218 case R_028860_SQ_PGM_RESOURCES_VS: 219 case R_0288D4_SQ_PGM_RESOURCES_LS: 220 bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value)); 221 bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value)); 222 break; 223 case R_02880C_DB_SHADER_CONTROL: 224 *use_kill = G_02880C_KILL_ENABLE(value); 225 break; 226 case R_0288E8_SQ_LDS_ALLOC: 227 bc->nlds_dw = value; 228 break; 229 } 230 } 231} 232 233static unsigned r600_create_shader(struct r600_bytecode *bc, 234 const struct radeon_shader_binary *binary, 235 boolean *use_kill) 236 237{ 238 assert(binary->code_size % 4 == 0); 239 bc->bytecode = CALLOC(1, binary->code_size); 240 memcpy(bc->bytecode, binary->code, binary->code_size); 241 bc->ndw = binary->code_size / 4; 242 243 r600_shader_binary_read_config(binary, bc, 0, use_kill); 244 return 0; 245} 246 247#endif 248 249static void r600_destroy_shader(struct r600_bytecode *bc) 250{ 251 FREE(bc->bytecode); 252} 253 254void *evergreen_create_compute_state(struct pipe_context *ctx_, 255 const const struct pipe_compute_state *cso) 256{ 257 struct r600_context *rctx = (struct r600_context *)ctx_; 258 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute); 259#ifdef HAVE_OPENCL 260 const struct pipe_llvm_program_header *header; 261 const char *code; 262 void *p; 263 boolean use_kill; 264 265 COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n"); 266 header = cso->prog; 267 code = cso->prog + sizeof(struct pipe_llvm_program_header); 268 radeon_shader_binary_init(&shader->binary); 269 radeon_elf_read(code, header->num_bytes, &shader->binary); 270 r600_create_shader(&shader->bc, &shader->binary, &use_kill); 271 272 shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen, 273 shader->bc.ndw * 4); 274 p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE); 275 memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4); 276 rctx->b.ws->buffer_unmap(shader->code_bo->buf); 277#endif 278 279 shader->ctx = rctx; 280 shader->local_size = cso->req_local_mem; 281 shader->private_size = cso->req_private_mem; 282 shader->input_size = cso->req_input_mem; 283 284 return shader; 285} 286 287void evergreen_delete_compute_state(struct pipe_context *ctx_, void *state) 288{ 289 struct r600_context *rctx = (struct r600_context *)ctx_; 290 struct r600_pipe_compute *shader = state; 291 292 COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n"); 293 294 if (!shader) 295 return; 296 297 radeon_shader_binary_clean(&shader->binary); 298 r600_destroy_shader(&shader->bc); 299 300 /* TODO destroy shader->code_bo, shader->const_bo 301 * we'll need something like r600_buffer_free */ 302 FREE(shader); 303} 304 305static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state) 306{ 307 struct r600_context *rctx = (struct r600_context *)ctx_; 308 309 COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n"); 310 311 rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state; 312} 313 314/* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit 315 * kernel parameters there are implicit parameters that need to be stored 316 * in the vertex buffer as well. Here is how these parameters are organized in 317 * the buffer: 318 * 319 * DWORDS 0-2: Number of work groups in each dimension (x,y,z) 320 * DWORDS 3-5: Number of global work items in each dimension (x,y,z) 321 * DWORDS 6-8: Number of work items within each work group in each dimension 322 * (x,y,z) 323 * DWORDS 9+ : Kernel parameters 324 */ 325void evergreen_compute_upload_input(struct pipe_context *ctx_, 326 const uint *block_layout, 327 const uint *grid_layout, 328 const void *input) 329{ 330 struct r600_context *rctx = (struct r600_context *)ctx_; 331 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; 332 unsigned i; 333 /* We need to reserve 9 dwords (36 bytes) for implicit kernel 334 * parameters. 335 */ 336 unsigned input_size = shader->input_size + 36; 337 uint32_t *num_work_groups_start; 338 uint32_t *global_size_start; 339 uint32_t *local_size_start; 340 uint32_t *kernel_parameters_start; 341 struct pipe_box box; 342 struct pipe_transfer *transfer = NULL; 343 344 if (shader->input_size == 0) { 345 return; 346 } 347 348 if (!shader->kernel_param) { 349 /* Add space for the grid dimensions */ 350 shader->kernel_param = (struct r600_resource *) 351 pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM, 352 PIPE_USAGE_IMMUTABLE, input_size); 353 } 354 355 u_box_1d(0, input_size, &box); 356 num_work_groups_start = ctx_->transfer_map(ctx_, 357 (struct pipe_resource*)shader->kernel_param, 358 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE, 359 &box, &transfer); 360 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4)); 361 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4); 362 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4); 363 364 /* Copy the work group size */ 365 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint)); 366 367 /* Copy the global size */ 368 for (i = 0; i < 3; i++) { 369 global_size_start[i] = grid_layout[i] * block_layout[i]; 370 } 371 372 /* Copy the local dimensions */ 373 memcpy(local_size_start, block_layout, 3 * sizeof(uint)); 374 375 /* Copy the kernel inputs */ 376 memcpy(kernel_parameters_start, input, shader->input_size); 377 378 for (i = 0; i < (input_size / 4); i++) { 379 COMPUTE_DBG(rctx->screen, "input %i : %u\n", i, 380 ((unsigned*)num_work_groups_start)[i]); 381 } 382 383 ctx_->transfer_unmap(ctx_, transfer); 384 385 /* ID=0 is reserved for the parameters */ 386 evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size, 387 (struct pipe_resource*)shader->kernel_param); 388} 389 390static void evergreen_emit_direct_dispatch(struct r600_context *rctx, 391 const uint *block_layout, 392 const uint *grid_layout) 393{ 394 int i; 395 struct radeon_winsys_cs *cs = rctx->b.gfx.cs; 396 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; 397 unsigned num_waves; 398 unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes; 399 unsigned wave_divisor = (16 * num_pipes); 400 int group_size = 1; 401 int grid_size = 1; 402 unsigned lds_size = shader->local_size / 4 + 403 shader->bc.nlds_dw; 404 405 406 /* Calculate group_size/grid_size */ 407 for (i = 0; i < 3; i++) { 408 group_size *= block_layout[i]; 409 } 410 411 for (i = 0; i < 3; i++) { 412 grid_size *= grid_layout[i]; 413 } 414 415 /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */ 416 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] + 417 wave_divisor - 1) / wave_divisor; 418 419 COMPUTE_DBG(rctx->screen, "Using %u pipes, " 420 "%u wavefronts per thread block, " 421 "allocating %u dwords lds.\n", 422 num_pipes, num_waves, lds_size); 423 424 radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size); 425 426 radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3); 427 radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */ 428 radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */ 429 radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */ 430 431 radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, 432 group_size); 433 434 radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3); 435 radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */ 436 radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */ 437 radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */ 438 439 if (rctx->b.chip_class < CAYMAN) { 440 assert(lds_size <= 8192); 441 } else { 442 /* Cayman appears to have a slightly smaller limit, see the 443 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */ 444 assert(lds_size <= 8160); 445 } 446 447 radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC, 448 lds_size | (num_waves << 14)); 449 450 /* Dispatch packet */ 451 radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0)); 452 radeon_emit(cs, grid_layout[0]); 453 radeon_emit(cs, grid_layout[1]); 454 radeon_emit(cs, grid_layout[2]); 455 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */ 456 radeon_emit(cs, 1); 457} 458 459static void compute_emit_cs(struct r600_context *rctx, 460 const uint *block_layout, 461 const uint *grid_layout) 462{ 463 struct radeon_winsys_cs *cs = rctx->b.gfx.cs; 464 unsigned i; 465 466 /* make sure that the gfx ring is only one active */ 467 if (rctx->b.dma.cs && rctx->b.dma.cs->cdw) { 468 rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); 469 } 470 471 /* Initialize all the compute-related registers. 472 * 473 * See evergreen_init_atom_start_compute_cs() in this file for the list 474 * of registers initialized by the start_compute_cs_cmd atom. 475 */ 476 r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd); 477 478 /* emit config state */ 479 if (rctx->b.chip_class == EVERGREEN) 480 r600_emit_atom(rctx, &rctx->config_state.atom); 481 482 rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; 483 r600_flush_emit(rctx); 484 485 /* Emit colorbuffers. */ 486 /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */ 487 for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) { 488 struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i]; 489 unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, 490 (struct r600_resource*)cb->base.texture, 491 RADEON_USAGE_READWRITE, 492 RADEON_PRIO_SHADER_RW_BUFFER); 493 494 radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7); 495 radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ 496 radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */ 497 radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */ 498 radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */ 499 radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */ 500 radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */ 501 radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */ 502 503 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */ 504 radeon_emit(cs, reloc); 505 506 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */ 507 radeon_emit(cs, reloc); 508 } 509 for (; i < 8 ; i++) 510 radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 511 S_028C70_FORMAT(V_028C70_COLOR_INVALID)); 512 for (; i < 12; i++) 513 radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C, 514 S_028C70_FORMAT(V_028C70_COLOR_INVALID)); 515 516 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */ 517 radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK, 518 rctx->compute_cb_target_mask); 519 520 521 /* Emit vertex buffer state */ 522 rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask); 523 r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom); 524 525 /* Emit constant buffer state */ 526 r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom); 527 528 /* Emit sampler state */ 529 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom); 530 531 /* Emit sampler view (texture resource) state */ 532 r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom); 533 534 /* Emit compute shader state */ 535 r600_emit_atom(rctx, &rctx->cs_shader_state.atom); 536 537 /* Emit dispatch state and dispatch packet */ 538 evergreen_emit_direct_dispatch(rctx, block_layout, grid_layout); 539 540 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff 541 */ 542 rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | 543 R600_CONTEXT_INV_VERTEX_CACHE | 544 R600_CONTEXT_INV_TEX_CACHE; 545 r600_flush_emit(rctx); 546 rctx->b.flags = 0; 547 548 if (rctx->b.chip_class >= CAYMAN) { 549 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); 550 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4); 551 /* DEALLOC_STATE prevents the GPU from hanging when a 552 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT 553 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set. 554 */ 555 cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0); 556 cs->buf[cs->cdw++] = 0; 557 } 558 559#if 0 560 COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw); 561 for (i = 0; i < cs->cdw; i++) { 562 COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]); 563 } 564#endif 565 566} 567 568 569/** 570 * Emit function for r600_cs_shader_state atom 571 */ 572void evergreen_emit_cs_shader(struct r600_context *rctx, 573 struct r600_atom *atom) 574{ 575 struct r600_cs_shader_state *state = 576 (struct r600_cs_shader_state*)atom; 577 struct r600_pipe_compute *shader = state->shader; 578 struct radeon_winsys_cs *cs = rctx->b.gfx.cs; 579 uint64_t va; 580 struct r600_resource *code_bo; 581 unsigned ngpr, nstack; 582 583 code_bo = shader->code_bo; 584 va = shader->code_bo->gpu_address + state->pc; 585 ngpr = shader->bc.ngpr; 586 nstack = shader->bc.nstack; 587 588 radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3); 589 radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */ 590 radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */ 591 S_0288D4_NUM_GPRS(ngpr) 592 | S_0288D4_STACK_SIZE(nstack)); 593 radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */ 594 595 radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0)); 596 radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, 597 code_bo, RADEON_USAGE_READ, 598 RADEON_PRIO_USER_SHADER)); 599} 600 601static void evergreen_launch_grid(struct pipe_context *ctx_, 602 const struct pipe_grid_info *info) 603{ 604 struct r600_context *rctx = (struct r600_context *)ctx_; 605#ifdef HAVE_OPENCL 606 struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; 607 boolean use_kill; 608 609 rctx->cs_shader_state.pc = info->pc; 610 /* Get the config information for this kernel. */ 611 r600_shader_binary_read_config(&shader->binary, &shader->bc, 612 info->pc, &use_kill); 613#endif 614 615 COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc); 616 617 618 evergreen_compute_upload_input(ctx_, info->block, info->grid, info->input); 619 compute_emit_cs(rctx, info->block, info->grid); 620} 621 622static void evergreen_set_compute_resources(struct pipe_context *ctx_, 623 unsigned start, unsigned count, 624 struct pipe_surface **surfaces) 625{ 626 struct r600_context *rctx = (struct r600_context *)ctx_; 627 struct r600_surface **resources = (struct r600_surface **)surfaces; 628 629 COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n", 630 start, count); 631 632 for (unsigned i = 0; i < count; i++) { 633 /* The First two vertex buffers are reserved for parameters and 634 * global buffers. */ 635 unsigned vtx_id = 2 + i; 636 if (resources[i]) { 637 struct r600_resource_global *buffer = 638 (struct r600_resource_global*) 639 resources[i]->base.texture; 640 if (resources[i]->base.writable) { 641 assert(i+1 < 12); 642 643 evergreen_set_rat(rctx->cs_shader_state.shader, i+1, 644 (struct r600_resource *)resources[i]->base.texture, 645 buffer->chunk->start_in_dw*4, 646 resources[i]->base.texture->width0); 647 } 648 649 evergreen_cs_set_vertex_buffer(rctx, vtx_id, 650 buffer->chunk->start_in_dw * 4, 651 resources[i]->base.texture); 652 } 653 } 654} 655 656static void evergreen_set_global_binding(struct pipe_context *ctx_, 657 unsigned first, unsigned n, 658 struct pipe_resource **resources, 659 uint32_t **handles) 660{ 661 struct r600_context *rctx = (struct r600_context *)ctx_; 662 struct compute_memory_pool *pool = rctx->screen->global_pool; 663 struct r600_resource_global **buffers = 664 (struct r600_resource_global **)resources; 665 unsigned i; 666 667 COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n", 668 first, n); 669 670 if (!resources) { 671 /* XXX: Unset */ 672 return; 673 } 674 675 /* We mark these items for promotion to the pool if they 676 * aren't already there */ 677 for (i = first; i < first + n; i++) { 678 struct compute_memory_item *item = buffers[i]->chunk; 679 680 if (!is_item_in_pool(item)) 681 buffers[i]->chunk->status |= ITEM_FOR_PROMOTING; 682 } 683 684 if (compute_memory_finalize_pending(pool, ctx_) == -1) { 685 /* XXX: Unset */ 686 return; 687 } 688 689 for (i = first; i < first + n; i++) 690 { 691 uint32_t buffer_offset; 692 uint32_t handle; 693 assert(resources[i]->target == PIPE_BUFFER); 694 assert(resources[i]->bind & PIPE_BIND_GLOBAL); 695 696 buffer_offset = util_le32_to_cpu(*(handles[i])); 697 handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4; 698 699 *(handles[i]) = util_cpu_to_le32(handle); 700 } 701 702 evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4); 703 evergreen_cs_set_vertex_buffer(rctx, 1, 0, 704 (struct pipe_resource*)pool->bo); 705} 706 707/** 708 * This function initializes all the compute specific registers that need to 709 * be initialized for each compute command stream. Registers that are common 710 * to both compute and 3D will be initialized at the beginning of each compute 711 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG 712 * packet requires that the shader type bit be set, we must initialize all 713 * context registers needed for compute in this function. The registers 714 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the 715 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending 716 * on the GPU family. 717 */ 718void evergreen_init_atom_start_compute_cs(struct r600_context *rctx) 719{ 720 struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd; 721 int num_threads; 722 int num_stack_entries; 723 724 /* since all required registers are initialized in the 725 * start_compute_cs_cmd atom, we can EMIT_EARLY here. 726 */ 727 r600_init_command_buffer(cb, 256); 728 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE; 729 730 /* This must be first. */ 731 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); 732 r600_store_value(cb, 0x80000000); 733 r600_store_value(cb, 0x80000000); 734 735 /* We're setting config registers here. */ 736 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0)); 737 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 738 739 switch (rctx->b.family) { 740 case CHIP_CEDAR: 741 default: 742 num_threads = 128; 743 num_stack_entries = 256; 744 break; 745 case CHIP_REDWOOD: 746 num_threads = 128; 747 num_stack_entries = 256; 748 break; 749 case CHIP_JUNIPER: 750 num_threads = 128; 751 num_stack_entries = 512; 752 break; 753 case CHIP_CYPRESS: 754 case CHIP_HEMLOCK: 755 num_threads = 128; 756 num_stack_entries = 512; 757 break; 758 case CHIP_PALM: 759 num_threads = 128; 760 num_stack_entries = 256; 761 break; 762 case CHIP_SUMO: 763 num_threads = 128; 764 num_stack_entries = 256; 765 break; 766 case CHIP_SUMO2: 767 num_threads = 128; 768 num_stack_entries = 512; 769 break; 770 case CHIP_BARTS: 771 num_threads = 128; 772 num_stack_entries = 512; 773 break; 774 case CHIP_TURKS: 775 num_threads = 128; 776 num_stack_entries = 256; 777 break; 778 case CHIP_CAICOS: 779 num_threads = 128; 780 num_stack_entries = 256; 781 break; 782 } 783 784 /* Config Registers */ 785 if (rctx->b.chip_class < CAYMAN) 786 evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family, 787 rctx->screen->b.info.drm_minor); 788 else 789 cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family, 790 rctx->screen->b.info.drm_minor); 791 792 /* The primitive type always needs to be POINTLIST for compute. */ 793 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE, 794 V_008958_DI_PT_POINTLIST); 795 796 if (rctx->b.chip_class < CAYMAN) { 797 798 /* These registers control which simds can be used by each stage. 799 * The default for these registers is 0xffffffff, which means 800 * all simds are available for each stage. It's possible we may 801 * want to play around with these in the future, but for now 802 * the default value is fine. 803 * 804 * R_008E20_SQ_STATIC_THREAD_MGMT1 805 * R_008E24_SQ_STATIC_THREAD_MGMT2 806 * R_008E28_SQ_STATIC_THREAD_MGMT3 807 */ 808 809 /* XXX: We may need to adjust the thread and stack resource 810 * values for 3D/compute interop */ 811 812 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5); 813 814 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1 815 * Set the number of threads used by the PS/VS/GS/ES stage to 816 * 0. 817 */ 818 r600_store_value(cb, 0); 819 820 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2 821 * Set the number of threads used by the CS (aka LS) stage to 822 * the maximum number of threads and set the number of threads 823 * for the HS stage to 0. */ 824 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads)); 825 826 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1 827 * Set the Control Flow stack entries to 0 for PS/VS stages */ 828 r600_store_value(cb, 0); 829 830 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2 831 * Set the Control Flow stack entries to 0 for GS/ES stages */ 832 r600_store_value(cb, 0); 833 834 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3 835 * Set the Contol Flow stack entries to 0 for the HS stage, and 836 * set it to the maximum value for the CS (aka LS) stage. */ 837 r600_store_value(cb, 838 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries)); 839 } 840 /* Give the compute shader all the available LDS space. 841 * NOTE: This only sets the maximum number of dwords that a compute 842 * shader can allocate. When a shader is executed, we still need to 843 * allocate the appropriate amount of LDS dwords using the 844 * CM_R_0288E8_SQ_LDS_ALLOC register. 845 */ 846 if (rctx->b.chip_class < CAYMAN) { 847 r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT, 848 S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192)); 849 } else { 850 r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT, 851 S_0286FC_NUM_PS_LDS(0) | 852 S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */ 853 } 854 855 /* Context Registers */ 856 857 if (rctx->b.chip_class < CAYMAN) { 858 /* workaround for hw issues with dyn gpr - must set all limits 859 * to 240 instead of 0, 0x1e == 240 / 8 860 */ 861 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1, 862 S_028838_PS_GPRS(0x1e) | 863 S_028838_VS_GPRS(0x1e) | 864 S_028838_GS_GPRS(0x1e) | 865 S_028838_ES_GPRS(0x1e) | 866 S_028838_HS_GPRS(0x1e) | 867 S_028838_LS_GPRS(0x1e)); 868 } 869 870 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */ 871 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE, 872 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1)); 873 874 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/); 875 876 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL, 877 S_0286E8_TID_IN_GROUP_ENA 878 | S_0286E8_TGID_ENA 879 | S_0286E8_DISABLE_INDEX_PACK) 880 ; 881 882 /* The LOOP_CONST registers are an optimizations for loops that allows 883 * you to store the initial counter, increment value, and maximum 884 * counter value in a register so that hardware can calculate the 885 * correct number of iterations for the loop, so that you don't need 886 * to have the loop counter in your shader code. We don't currently use 887 * this optimization, so we must keep track of the counter in the 888 * shader and use a break instruction to exit loops. However, the 889 * hardware will still uses this register to determine when to exit a 890 * loop, so we need to initialize the counter to 0, set the increment 891 * value to 1 and the maximum counter value to the 4095 (0xfff) which 892 * is the maximum value allowed. This gives us a maximum of 4096 893 * iterations for our loops, but hopefully our break instruction will 894 * execute before some time before the 4096th iteration. 895 */ 896 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF); 897} 898 899void evergreen_init_compute_state_functions(struct r600_context *rctx) 900{ 901 rctx->b.b.create_compute_state = evergreen_create_compute_state; 902 rctx->b.b.delete_compute_state = evergreen_delete_compute_state; 903 rctx->b.b.bind_compute_state = evergreen_bind_compute_state; 904// rctx->context.create_sampler_view = evergreen_compute_create_sampler_view; 905 rctx->b.b.set_compute_resources = evergreen_set_compute_resources; 906 rctx->b.b.set_global_binding = evergreen_set_global_binding; 907 rctx->b.b.launch_grid = evergreen_launch_grid; 908 909} 910 911struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen, 912 const struct pipe_resource *templ) 913{ 914 struct r600_resource_global* result = NULL; 915 struct r600_screen* rscreen = NULL; 916 int size_in_dw = 0; 917 918 assert(templ->target == PIPE_BUFFER); 919 assert(templ->bind & PIPE_BIND_GLOBAL); 920 assert(templ->array_size == 1 || templ->array_size == 0); 921 assert(templ->depth0 == 1 || templ->depth0 == 0); 922 assert(templ->height0 == 1 || templ->height0 == 0); 923 924 result = (struct r600_resource_global*) 925 CALLOC(sizeof(struct r600_resource_global), 1); 926 rscreen = (struct r600_screen*)screen; 927 928 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n"); 929 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0, 930 templ->array_size); 931 932 result->base.b.vtbl = &r600_global_buffer_vtbl; 933 result->base.b.b = *templ; 934 result->base.b.b.screen = screen; 935 pipe_reference_init(&result->base.b.b.reference, 1); 936 937 size_in_dw = (templ->width0+3) / 4; 938 939 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw); 940 941 if (result->chunk == NULL) 942 { 943 free(result); 944 return NULL; 945 } 946 947 return &result->base.b.b; 948} 949 950void r600_compute_global_buffer_destroy(struct pipe_screen *screen, 951 struct pipe_resource *res) 952{ 953 struct r600_resource_global* buffer = NULL; 954 struct r600_screen* rscreen = NULL; 955 956 assert(res->target == PIPE_BUFFER); 957 assert(res->bind & PIPE_BIND_GLOBAL); 958 959 buffer = (struct r600_resource_global*)res; 960 rscreen = (struct r600_screen*)screen; 961 962 compute_memory_free(rscreen->global_pool, buffer->chunk->id); 963 964 buffer->chunk = NULL; 965 free(res); 966} 967 968void *r600_compute_global_transfer_map(struct pipe_context *ctx_, 969 struct pipe_resource *resource, 970 unsigned level, 971 unsigned usage, 972 const struct pipe_box *box, 973 struct pipe_transfer **ptransfer) 974{ 975 struct r600_context *rctx = (struct r600_context*)ctx_; 976 struct compute_memory_pool *pool = rctx->screen->global_pool; 977 struct r600_resource_global* buffer = 978 (struct r600_resource_global*)resource; 979 980 struct compute_memory_item *item = buffer->chunk; 981 struct pipe_resource *dst = NULL; 982 unsigned offset = box->x; 983 984 if (is_item_in_pool(item)) { 985 compute_memory_demote_item(pool, item, ctx_); 986 } 987 else { 988 if (item->real_buffer == NULL) { 989 item->real_buffer = 990 r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4); 991 } 992 } 993 994 dst = (struct pipe_resource*)item->real_buffer; 995 996 if (usage & PIPE_TRANSFER_READ) 997 buffer->chunk->status |= ITEM_MAPPED_FOR_READING; 998 999 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n" 1000 "level = %u, usage = %u, box(x = %u, y = %u, z = %u " 1001 "width = %u, height = %u, depth = %u)\n", level, usage, 1002 box->x, box->y, box->z, box->width, box->height, 1003 box->depth); 1004 COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = " 1005 "%u (box.x)\n", item->id, box->x); 1006 1007 1008 assert(resource->target == PIPE_BUFFER); 1009 assert(resource->bind & PIPE_BIND_GLOBAL); 1010 assert(box->x >= 0); 1011 assert(box->y == 0); 1012 assert(box->z == 0); 1013 1014 ///TODO: do it better, mapping is not possible if the pool is too big 1015 return pipe_buffer_map_range(ctx_, dst, 1016 offset, box->width, usage, ptransfer); 1017} 1018 1019void r600_compute_global_transfer_unmap(struct pipe_context *ctx_, 1020 struct pipe_transfer *transfer) 1021{ 1022 /* struct r600_resource_global are not real resources, they just map 1023 * to an offset within the compute memory pool. The function 1024 * r600_compute_global_transfer_map() maps the memory pool 1025 * resource rather than the struct r600_resource_global passed to 1026 * it as an argument and then initalizes ptransfer->resource with 1027 * the memory pool resource (via pipe_buffer_map_range). 1028 * When transfer_unmap is called it uses the memory pool's 1029 * vtable which calls r600_buffer_transfer_map() rather than 1030 * this function. 1031 */ 1032 assert (!"This function should not be called"); 1033} 1034 1035void r600_compute_global_transfer_flush_region(struct pipe_context *ctx_, 1036 struct pipe_transfer *transfer, 1037 const struct pipe_box *box) 1038{ 1039 assert(0 && "TODO"); 1040} 1041 1042void r600_compute_global_transfer_inline_write(struct pipe_context *pipe, 1043 struct pipe_resource *resource, 1044 unsigned level, 1045 unsigned usage, 1046 const struct pipe_box *box, 1047 const void *data, 1048 unsigned stride, 1049 unsigned layer_stride) 1050{ 1051 assert(0 && "TODO"); 1052} 1053