/* evergreen_compute.c — revision ec7d775790bef929b15e4c82d68ccaaf92c9f6b7 */
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "r600.h"
#include "evergreend.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#ifdef HAVE_OPENCL
#include "llvm_wrapper.h"
#endif

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources, that VTX2... would consume

CONST0 and VTX0 is for parameters
  CONST0 is binding smaller input parameter buffer, and for constant indexing,
  also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RAT-s are limited to 12, so we can only bind at most 11 texture for writing
because we reserve RAT0 for global bindings. With byteaddressing enabled,
we should reserve another one too. => 10 image binding for writing max.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:  128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8

so 10 for writing is enough.
176 is the max for reading according to the docs 79 80writable images should be listed first < 10, so their id corresponds to RAT(id+1) 81writable images will consume TEX slots, VTX slots too because of linear indexing 82 83*/ 84 85static void evergreen_cs_set_vertex_buffer( 86 struct r600_context * rctx, 87 unsigned vb_index, 88 unsigned offset, 89 struct pipe_resource * buffer) 90{ 91 struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state; 92 struct pipe_vertex_buffer *vb = &state->vb[vb_index]; 93 vb->stride = 1; 94 vb->buffer_offset = offset; 95 vb->buffer = buffer; 96 vb->user_buffer = NULL; 97 98 /* The vertex instructions in the compute shaders use the texture cache, 99 * so we need to invalidate it. */ 100 rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES; 101 state->enabled_mask |= 1 << vb_index; 102 state->dirty_mask |= 1 << vb_index; 103 state->atom.dirty = true; 104} 105 106static const struct u_resource_vtbl r600_global_buffer_vtbl = 107{ 108 u_default_resource_get_handle, /* get_handle */ 109 r600_compute_global_buffer_destroy, /* resource_destroy */ 110 r600_compute_global_transfer_map, /* transfer_map */ 111 r600_compute_global_transfer_flush_region,/* transfer_flush_region */ 112 r600_compute_global_transfer_unmap, /* transfer_unmap */ 113 r600_compute_global_transfer_inline_write /* transfer_inline_write */ 114}; 115 116 117void *evergreen_create_compute_state( 118 struct pipe_context *ctx_, 119 const const struct pipe_compute_state *cso) 120{ 121 struct r600_context *ctx = (struct r600_context *)ctx_; 122 struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute); 123 124#ifdef HAVE_OPENCL 125 const struct pipe_llvm_program_header * header; 126 const unsigned char * code; 127 unsigned i; 128 129 COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n"); 130 131 header = cso->prog; 132 code = cso->prog + sizeof(struct pipe_llvm_program_header); 133#endif 134 135 shader->ctx = (struct r600_context*)ctx; 136 shader->resources 
= (struct evergreen_compute_resource*) 137 CALLOC(sizeof(struct evergreen_compute_resource), 138 get_compute_resource_num()); 139 shader->local_size = cso->req_local_mem; ///TODO: assert it 140 shader->private_size = cso->req_private_mem; 141 shader->input_size = cso->req_input_mem; 142 143#ifdef HAVE_OPENCL 144 shader->num_kernels = llvm_get_num_kernels(code, header->num_bytes); 145 shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels); 146 147 for (i = 0; i < shader->num_kernels; i++) { 148 struct r600_kernel *kernel = &shader->kernels[i]; 149 kernel->llvm_module = llvm_get_kernel_module(i, code, 150 header->num_bytes); 151 } 152#endif 153 return shader; 154} 155 156void evergreen_delete_compute_state(struct pipe_context *ctx, void* state) 157{ 158 struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state; 159 160 free(shader->resources); 161 free(shader); 162} 163 164static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state) 165{ 166 struct r600_context *ctx = (struct r600_context *)ctx_; 167 168 COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n"); 169 170 ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state; 171} 172 173/* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit 174 * kernel parameters there are inplicit parameters that need to be stored 175 * in the vertex buffer as well. 
Here is how these parameters are organized in 176 * the buffer: 177 * 178 * DWORDS 0-2: Number of work groups in each dimension (x,y,z) 179 * DWORDS 3-5: Number of global work items in each dimension (x,y,z) 180 * DWORDS 6-8: Number of work items within each work group in each dimension 181 * (x,y,z) 182 * DWORDS 9+ : Kernel parameters 183 */ 184void evergreen_compute_upload_input( 185 struct pipe_context *ctx_, 186 const uint *block_layout, 187 const uint *grid_layout, 188 const void *input) 189{ 190 struct r600_context *ctx = (struct r600_context *)ctx_; 191 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader; 192 int i; 193 unsigned kernel_parameters_offset_bytes = 36; 194 uint32_t * num_work_groups_start; 195 uint32_t * global_size_start; 196 uint32_t * local_size_start; 197 uint32_t * kernel_parameters_start; 198 199 if (shader->input_size == 0) { 200 return; 201 } 202 203 if (!shader->kernel_param) { 204 unsigned buffer_size = shader->input_size; 205 206 /* Add space for the grid dimensions */ 207 buffer_size += kernel_parameters_offset_bytes * sizeof(uint); 208 shader->kernel_param = r600_compute_buffer_alloc_vram( 209 ctx->screen, buffer_size); 210 } 211 212 num_work_groups_start = r600_buffer_mmap_sync_with_rings(ctx, shader->kernel_param, PIPE_TRANSFER_WRITE); 213 global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4)); 214 local_size_start = global_size_start + (3 * (sizeof(uint)) / 4); 215 kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4); 216 217 /* Copy the work group size */ 218 memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint)); 219 220 /* Copy the global size */ 221 for (i = 0; i < 3; i++) { 222 global_size_start[i] = grid_layout[i] * block_layout[i]; 223 } 224 225 /* Copy the local dimensions */ 226 memcpy(local_size_start, block_layout, 3 * sizeof(uint)); 227 228 /* Copy the kernel inputs */ 229 memcpy(kernel_parameters_start, input, shader->input_size); 230 231 for (i = 0; i < 
(kernel_parameters_offset_bytes / 4) + 232 (shader->input_size / 4); i++) { 233 COMPUTE_DBG(ctx->screen, "input %i : %i\n", i, 234 ((unsigned*)num_work_groups_start)[i]); 235 } 236 237 ctx->ws->buffer_unmap(shader->kernel_param->cs_buf); 238 239 ///ID=0 is reserved for the parameters 240 evergreen_cs_set_vertex_buffer(ctx, 0, 0, 241 (struct pipe_resource*)shader->kernel_param); 242 ///ID=0 is reserved for parameters 243 evergreen_set_const_cache(shader, 0, shader->kernel_param, 244 shader->input_size, 0); 245} 246 247static void evergreen_emit_direct_dispatch( 248 struct r600_context *rctx, 249 const uint *block_layout, const uint *grid_layout) 250{ 251 int i; 252 struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; 253 unsigned num_waves; 254 unsigned num_pipes = rctx->screen->info.r600_max_pipes; 255 unsigned wave_divisor = (16 * num_pipes); 256 int group_size = 1; 257 int grid_size = 1; 258 /* XXX: Enable lds and get size from cs_shader_state */ 259 unsigned lds_size = 0; 260 261 /* Calculate group_size/grid_size */ 262 for (i = 0; i < 3; i++) { 263 group_size *= block_layout[i]; 264 } 265 266 for (i = 0; i < 3; i++) { 267 grid_size *= grid_layout[i]; 268 } 269 270 /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */ 271 num_waves = (block_layout[0] * block_layout[1] * block_layout[2] + 272 wave_divisor - 1) / wave_divisor; 273 274 COMPUTE_DBG(rctx->screen, "Using %u pipes, there are %u wavefronts per thread block\n", 275 num_pipes, num_waves); 276 277 /* XXX: Partition the LDS between PS/CS. By default half (4096 dwords 278 * on Evergreen) oes to Pixel Shaders and half goes to Compute Shaders. 279 * We may need to allocat the entire LDS space for Compute Shaders. 
280 * 281 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords) 282 * CM: CM_R_0286FC_SPI_LDS_MGMT := S_0286FC_NUM_LS_LDS(lds_dwords) 283 */ 284 285 r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size); 286 287 r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3); 288 r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */ 289 r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */ 290 r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */ 291 292 r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, 293 group_size); 294 295 r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3); 296 r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */ 297 r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */ 298 r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */ 299 300 r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC, 301 lds_size | (num_waves << 14)); 302 303 /* Dispatch packet */ 304 r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0)); 305 r600_write_value(cs, grid_layout[0]); 306 r600_write_value(cs, grid_layout[1]); 307 r600_write_value(cs, grid_layout[2]); 308 /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */ 309 r600_write_value(cs, 1); 310} 311 312static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, 313 const uint *grid_layout) 314{ 315 struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; 316 unsigned flush_flags = 0; 317 int i; 318 struct r600_resource *onebo = NULL; 319 struct evergreen_compute_resource *resources = 320 ctx->cs_shader_state.shader->resources; 321 322 /* make sure that the gfx ring is only one active */ 323 if (ctx->rings.dma.cs) { 324 ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC); 325 } 326 327 /* Initialize all the compute-related registers. 
328 * 329 * See evergreen_init_atom_start_compute_cs() in this file for the list 330 * of registers initialized by the start_compute_cs_cmd atom. 331 */ 332 r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd); 333 334 ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; 335 r600_flush_emit(ctx); 336 337 /* Emit colorbuffers. */ 338 for (i = 0; i < ctx->framebuffer.state.nr_cbufs; i++) { 339 struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i]; 340 unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx, 341 (struct r600_resource*)cb->base.texture, 342 RADEON_USAGE_READWRITE); 343 344 r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7); 345 r600_write_value(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ 346 r600_write_value(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */ 347 r600_write_value(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */ 348 r600_write_value(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */ 349 r600_write_value(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */ 350 r600_write_value(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */ 351 r600_write_value(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */ 352 353 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */ 354 r600_write_value(cs, reloc); 355 356 if (!ctx->keep_tiling_flags) { 357 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */ 358 r600_write_value(cs, reloc); 359 } 360 361 r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */ 362 r600_write_value(cs, reloc); 363 } 364 365 /* Set CB_TARGET_MASK XXX: Use cb_misc_state */ 366 r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK, 367 ctx->compute_cb_target_mask); 368 369 370 /* Emit vertex buffer state */ 371 ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask); 372 r600_emit_atom(ctx, 
&ctx->cs_vertex_buffer_state.atom); 373 374 /* Emit compute shader state */ 375 r600_emit_atom(ctx, &ctx->cs_shader_state.atom); 376 377 for (i = 0; i < get_compute_resource_num(); i++) { 378 if (resources[i].enabled) { 379 int j; 380 COMPUTE_DBG(ctx->screen, "resnum: %i, cdw: %i\n", i, cs->cdw); 381 382 for (j = 0; j < resources[i].cs_end; j++) { 383 if (resources[i].do_reloc[j]) { 384 assert(resources[i].bo); 385 evergreen_emit_ctx_reloc(ctx, 386 resources[i].bo, 387 resources[i].usage); 388 } 389 390 cs->buf[cs->cdw++] = resources[i].cs[j]; 391 } 392 393 if (resources[i].bo) { 394 onebo = resources[i].bo; 395 evergreen_emit_ctx_reloc(ctx, 396 resources[i].bo, 397 resources[i].usage); 398 399 ///special case for textures 400 if (resources[i].do_reloc 401 [resources[i].cs_end] == 2) { 402 evergreen_emit_ctx_reloc(ctx, 403 resources[i].bo, 404 resources[i].usage); 405 } 406 } 407 } 408 } 409 410 /* Emit dispatch state and dispatch packet */ 411 evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout); 412 413 /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff 414 */ 415 ctx->flags |= R600_CONTEXT_INVAL_READ_CACHES; 416 r600_flush_emit(ctx); 417 418#if 0 419 COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw); 420 for (i = 0; i < cs->cdw; i++) { 421 COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, ctx->cs->buf[i]); 422 } 423#endif 424 425 flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE; 426 if (ctx->keep_tiling_flags) { 427 flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS; 428 } 429 430 ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags); 431 432 ctx->flags = 0; 433 434 COMPUTE_DBG(ctx->screen, "shader started\n"); 435 436 ctx->ws->buffer_wait(onebo->buf, 0); 437 438 COMPUTE_DBG(ctx->screen, "...\n"); 439} 440 441 442/** 443 * Emit function for r600_cs_shader_state atom 444 */ 445void evergreen_emit_cs_shader( 446 struct r600_context *rctx, 447 struct r600_atom *atom) 448{ 449 struct r600_cs_shader_state *state = 450 (struct 
r600_cs_shader_state*)atom; 451 struct r600_pipe_compute *shader = state->shader; 452 struct r600_kernel *kernel = &shader->kernels[state->kernel_index]; 453 struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; 454 uint64_t va; 455 456 va = r600_resource_va(&rctx->screen->screen, &kernel->code_bo->b.b); 457 458 r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3); 459 r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */ 460 r600_write_value(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */ 461 S_0288D4_NUM_GPRS(kernel->bc.ngpr) 462 | S_0288D4_STACK_SIZE(kernel->bc.nstack)); 463 r600_write_value(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */ 464 465 r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0)); 466 r600_write_value(cs, r600_context_bo_reloc(rctx, &rctx->rings.gfx, 467 kernel->code_bo, RADEON_USAGE_READ)); 468 469 rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES; 470} 471 472static void evergreen_launch_grid( 473 struct pipe_context *ctx_, 474 const uint *block_layout, const uint *grid_layout, 475 uint32_t pc, const void *input) 476{ 477 struct r600_context *ctx = (struct r600_context *)ctx_; 478 479#ifdef HAVE_OPENCL 480 COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc); 481 482 struct r600_pipe_compute *shader = ctx->cs_shader_state.shader; 483 if (!shader->kernels[pc].code_bo) { 484 void *p; 485 struct r600_kernel *kernel = &shader->kernels[pc]; 486 r600_compute_shader_create(ctx_, kernel->llvm_module, &kernel->bc); 487 kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen, 488 kernel->bc.ndw * 4); 489 p = r600_buffer_mmap_sync_with_rings(ctx, kernel->code_bo, PIPE_TRANSFER_WRITE); 490 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4); 491 ctx->ws->buffer_unmap(kernel->code_bo->cs_buf); 492 } 493#endif 494 495 ctx->cs_shader_state.kernel_index = pc; 496 evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input); 497 compute_emit_cs(ctx, block_layout, grid_layout); 498} 499 500static void 
evergreen_set_compute_resources(struct pipe_context * ctx_, 501 unsigned start, unsigned count, 502 struct pipe_surface ** surfaces) 503{ 504 struct r600_context *ctx = (struct r600_context *)ctx_; 505 struct r600_surface **resources = (struct r600_surface **)surfaces; 506 507 COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n", 508 start, count); 509 510 for (int i = 0; i < count; i++) { 511 /* The First two vertex buffers are reserved for parameters and 512 * global buffers. */ 513 unsigned vtx_id = 2 + i; 514 if (resources[i]) { 515 struct r600_resource_global *buffer = 516 (struct r600_resource_global*) 517 resources[i]->base.texture; 518 if (resources[i]->base.writable) { 519 assert(i+1 < 12); 520 521 evergreen_set_rat(ctx->cs_shader_state.shader, i+1, 522 (struct r600_resource *)resources[i]->base.texture, 523 buffer->chunk->start_in_dw*4, 524 resources[i]->base.texture->width0); 525 } 526 527 evergreen_cs_set_vertex_buffer(ctx, vtx_id, 528 buffer->chunk->start_in_dw * 4, 529 resources[i]->base.texture); 530 } 531 } 532} 533 534static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_, 535 unsigned start_slot, unsigned count, 536 struct pipe_sampler_view **views) 537{ 538 struct r600_context *ctx = (struct r600_context *)ctx_; 539 struct r600_pipe_sampler_view **resource = 540 (struct r600_pipe_sampler_view **)views; 541 542 for (int i = 0; i < count; i++) { 543 if (resource[i]) { 544 assert(i+1 < 12); 545 ///FETCH0 = VTX0 (param buffer), 546 //FETCH1 = VTX1 (global buffer pool), FETCH2... 
= TEX 547 evergreen_set_tex_resource(ctx->cs_shader_state.shader, resource[i], i+2); 548 } 549 } 550} 551 552static void evergreen_bind_compute_sampler_states( 553 struct pipe_context *ctx_, 554 unsigned start_slot, 555 unsigned num_samplers, 556 void **samplers_) 557{ 558 struct r600_context *ctx = (struct r600_context *)ctx_; 559 struct compute_sampler_state ** samplers = 560 (struct compute_sampler_state **)samplers_; 561 562 for (int i = 0; i < num_samplers; i++) { 563 if (samplers[i]) { 564 evergreen_set_sampler_resource( 565 ctx->cs_shader_state.shader, samplers[i], i); 566 } 567 } 568} 569 570static void evergreen_set_global_binding( 571 struct pipe_context *ctx_, unsigned first, unsigned n, 572 struct pipe_resource **resources, 573 uint32_t **handles) 574{ 575 struct r600_context *ctx = (struct r600_context *)ctx_; 576 struct compute_memory_pool *pool = ctx->screen->global_pool; 577 struct r600_resource_global **buffers = 578 (struct r600_resource_global **)resources; 579 580 COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n", 581 first, n); 582 583 if (!resources) { 584 /* XXX: Unset */ 585 return; 586 } 587 588 compute_memory_finalize_pending(pool, ctx_); 589 590 for (int i = 0; i < n; i++) 591 { 592 assert(resources[i]->target == PIPE_BUFFER); 593 assert(resources[i]->bind & PIPE_BIND_GLOBAL); 594 595 *(handles[i]) = buffers[i]->chunk->start_in_dw * 4; 596 } 597 598 evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4); 599 evergreen_cs_set_vertex_buffer(ctx, 1, 0, 600 (struct pipe_resource*)pool->bo); 601} 602 603/** 604 * This function initializes all the compute specific registers that need to 605 * be initialized for each compute command stream. Registers that are common 606 * to both compute and 3D will be initialized at the beginning of each compute 607 * command stream by the start_cs_cmd atom. 
However, since the SET_CONTEXT_REG 608 * packet requires that the shader type bit be set, we must initialize all 609 * context registers needed for compute in this function. The registers 610 * intialized by the start_cs_cmd atom can be found in evereen_state.c in the 611 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending 612 * on the GPU family. 613 */ 614void evergreen_init_atom_start_compute_cs(struct r600_context *ctx) 615{ 616 struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd; 617 int num_threads; 618 int num_stack_entries; 619 620 /* since all required registers are initialised in the 621 * start_compute_cs_cmd atom, we can EMIT_EARLY here. 622 */ 623 r600_init_command_buffer(cb, 256); 624 cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE; 625 626 /* This must be first. */ 627 r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); 628 r600_store_value(cb, 0x80000000); 629 r600_store_value(cb, 0x80000000); 630 631 /* We're setting config registers here. 
*/ 632 r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0)); 633 r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 634 635 switch (ctx->family) { 636 case CHIP_CEDAR: 637 default: 638 num_threads = 128; 639 num_stack_entries = 256; 640 break; 641 case CHIP_REDWOOD: 642 num_threads = 128; 643 num_stack_entries = 256; 644 break; 645 case CHIP_JUNIPER: 646 num_threads = 128; 647 num_stack_entries = 512; 648 break; 649 case CHIP_CYPRESS: 650 case CHIP_HEMLOCK: 651 num_threads = 128; 652 num_stack_entries = 512; 653 break; 654 case CHIP_PALM: 655 num_threads = 128; 656 num_stack_entries = 256; 657 break; 658 case CHIP_SUMO: 659 num_threads = 128; 660 num_stack_entries = 256; 661 break; 662 case CHIP_SUMO2: 663 num_threads = 128; 664 num_stack_entries = 512; 665 break; 666 case CHIP_BARTS: 667 num_threads = 128; 668 num_stack_entries = 512; 669 break; 670 case CHIP_TURKS: 671 num_threads = 128; 672 num_stack_entries = 256; 673 break; 674 case CHIP_CAICOS: 675 num_threads = 128; 676 num_stack_entries = 256; 677 break; 678 } 679 680 /* Config Registers */ 681 if (ctx->chip_class < CAYMAN) 682 evergreen_init_common_regs(cb, ctx->chip_class, ctx->family, 683 ctx->screen->info.drm_minor); 684 else 685 cayman_init_common_regs(cb, ctx->chip_class, ctx->family, 686 ctx->screen->info.drm_minor); 687 688 /* The primitive type always needs to be POINTLIST for compute. */ 689 r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE, 690 V_008958_DI_PT_POINTLIST); 691 692 if (ctx->chip_class < CAYMAN) { 693 694 /* These registers control which simds can be used by each stage. 695 * The default for these registers is 0xffffffff, which means 696 * all simds are available for each stage. It's possible we may 697 * want to play around with these in the future, but for now 698 * the default value is fine. 
699 * 700 * R_008E20_SQ_STATIC_THREAD_MGMT1 701 * R_008E24_SQ_STATIC_THREAD_MGMT2 702 * R_008E28_SQ_STATIC_THREAD_MGMT3 703 */ 704 705 /* XXX: We may need to adjust the thread and stack resouce 706 * values for 3D/compute interop */ 707 708 r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5); 709 710 /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1 711 * Set the number of threads used by the PS/VS/GS/ES stage to 712 * 0. 713 */ 714 r600_store_value(cb, 0); 715 716 /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2 717 * Set the number of threads used by the CS (aka LS) stage to 718 * the maximum number of threads and set the number of threads 719 * for the HS stage to 0. */ 720 r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads)); 721 722 /* R_008C20_SQ_STACK_RESOURCE_MGMT_1 723 * Set the Control Flow stack entries to 0 for PS/VS stages */ 724 r600_store_value(cb, 0); 725 726 /* R_008C24_SQ_STACK_RESOURCE_MGMT_2 727 * Set the Control Flow stack entries to 0 for GS/ES stages */ 728 r600_store_value(cb, 0); 729 730 /* R_008C28_SQ_STACK_RESOURCE_MGMT_3 731 * Set the Contol Flow stack entries to 0 for the HS stage, and 732 * set it to the maximum value for the CS (aka LS) stage. 
*/ 733 r600_store_value(cb, 734 S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries)); 735 } 736 737 /* Context Registers */ 738 739 if (ctx->chip_class < CAYMAN) { 740 /* workaround for hw issues with dyn gpr - must set all limits 741 * to 240 instead of 0, 0x1e == 240 / 8 742 */ 743 r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1, 744 S_028838_PS_GPRS(0x1e) | 745 S_028838_VS_GPRS(0x1e) | 746 S_028838_GS_GPRS(0x1e) | 747 S_028838_ES_GPRS(0x1e) | 748 S_028838_HS_GPRS(0x1e) | 749 S_028838_LS_GPRS(0x1e)); 750 } 751 752 /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */ 753 r600_store_context_reg(cb, R_028A40_VGT_GS_MODE, 754 S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1)); 755 756 r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/); 757 758 r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL, 759 S_0286E8_TID_IN_GROUP_ENA 760 | S_0286E8_TGID_ENA 761 | S_0286E8_DISABLE_INDEX_PACK) 762 ; 763 764 /* The LOOP_CONST registers are an optimizations for loops that allows 765 * you to store the initial counter, increment value, and maximum 766 * counter value in a register so that hardware can calculate the 767 * correct number of iterations for the loop, so that you don't need 768 * to have the loop counter in your shader code. We don't currently use 769 * this optimization, so we must keep track of the counter in the 770 * shader and use a break instruction to exit loops. However, the 771 * hardware will still uses this register to determine when to exit a 772 * loop, so we need to initialize the counter to 0, set the increment 773 * value to 1 and the maximum counter value to the 4095 (0xfff) which 774 * is the maximum value allowed. This gives us a maximum of 4096 775 * iterations for our loops, but hopefully our break instruction will 776 * execute before some time before the 4096th iteration. 
777 */ 778 eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF); 779} 780 781void evergreen_init_compute_state_functions(struct r600_context *ctx) 782{ 783 ctx->context.create_compute_state = evergreen_create_compute_state; 784 ctx->context.delete_compute_state = evergreen_delete_compute_state; 785 ctx->context.bind_compute_state = evergreen_bind_compute_state; 786// ctx->context.create_sampler_view = evergreen_compute_create_sampler_view; 787 ctx->context.set_compute_resources = evergreen_set_compute_resources; 788 ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view; 789 ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states; 790 ctx->context.set_global_binding = evergreen_set_global_binding; 791 ctx->context.launch_grid = evergreen_launch_grid; 792 793 /* We always use at least two vertex buffers for compute, one for 794 * parameters and one for global memory */ 795 ctx->cs_vertex_buffer_state.enabled_mask = 796 ctx->cs_vertex_buffer_state.dirty_mask = 1 | 2; 797} 798 799 800struct pipe_resource *r600_compute_global_buffer_create( 801 struct pipe_screen *screen, 802 const struct pipe_resource *templ) 803{ 804 struct r600_resource_global* result = NULL; 805 struct r600_screen* rscreen = NULL; 806 int size_in_dw = 0; 807 808 assert(templ->target == PIPE_BUFFER); 809 assert(templ->bind & PIPE_BIND_GLOBAL); 810 assert(templ->array_size == 1 || templ->array_size == 0); 811 assert(templ->depth0 == 1 || templ->depth0 == 0); 812 assert(templ->height0 == 1 || templ->height0 == 0); 813 814 result = (struct r600_resource_global*) 815 CALLOC(sizeof(struct r600_resource_global), 1); 816 rscreen = (struct r600_screen*)screen; 817 818 COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n"); 819 COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0, 820 templ->array_size); 821 822 result->base.b.vtbl = &r600_global_buffer_vtbl; 823 result->base.b.b.screen = screen; 824 result->base.b.b = 
*templ; 825 pipe_reference_init(&result->base.b.b.reference, 1); 826 827 size_in_dw = (templ->width0+3) / 4; 828 829 result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw); 830 831 if (result->chunk == NULL) 832 { 833 free(result); 834 return NULL; 835 } 836 837 return &result->base.b.b; 838} 839 840void r600_compute_global_buffer_destroy( 841 struct pipe_screen *screen, 842 struct pipe_resource *res) 843{ 844 struct r600_resource_global* buffer = NULL; 845 struct r600_screen* rscreen = NULL; 846 847 assert(res->target == PIPE_BUFFER); 848 assert(res->bind & PIPE_BIND_GLOBAL); 849 850 buffer = (struct r600_resource_global*)res; 851 rscreen = (struct r600_screen*)screen; 852 853 compute_memory_free(rscreen->global_pool, buffer->chunk->id); 854 855 buffer->chunk = NULL; 856 free(res); 857} 858 859void *r600_compute_global_transfer_map( 860 struct pipe_context *ctx_, 861 struct pipe_resource *resource, 862 unsigned level, 863 unsigned usage, 864 const struct pipe_box *box, 865 struct pipe_transfer **ptransfer) 866{ 867 struct r600_context *rctx = (struct r600_context*)ctx_; 868 struct compute_memory_pool *pool = rctx->screen->global_pool; 869 struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers); 870 struct r600_resource_global* buffer = 871 (struct r600_resource_global*)resource; 872 uint32_t* map; 873 874 compute_memory_finalize_pending(pool, ctx_); 875 876 assert(resource->target == PIPE_BUFFER); 877 878 COMPUTE_DBG(rctx->screen, "* r600_compute_global_get_transfer()\n" 879 "level = %u, usage = %u, box(x = %u, y = %u, z = %u " 880 "width = %u, height = %u, depth = %u)\n", level, usage, 881 box->x, box->y, box->z, box->width, box->height, 882 box->depth); 883 884 transfer->resource = resource; 885 transfer->level = level; 886 transfer->usage = usage; 887 transfer->box = *box; 888 transfer->stride = 0; 889 transfer->layer_stride = 0; 890 891 assert(transfer->resource->target == PIPE_BUFFER); 892 assert(transfer->resource->bind & 
PIPE_BIND_GLOBAL); 893 assert(transfer->box.x >= 0); 894 assert(transfer->box.y == 0); 895 assert(transfer->box.z == 0); 896 897 ///TODO: do it better, mapping is not possible if the pool is too big 898 899 COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"); 900 901 if (!(map = r600_buffer_mmap_sync_with_rings(rctx, buffer->chunk->pool->bo, transfer->usage))) { 902 util_slab_free(&rctx->pool_transfers, transfer); 903 return NULL; 904 } 905 906 *ptransfer = transfer; 907 908 COMPUTE_DBG(rctx->screen, "Buffer: %p + %u (buffer offset in global memory) " 909 "+ %u (box.x)\n", map, buffer->chunk->start_in_dw, transfer->box.x); 910 return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x; 911} 912 913void r600_compute_global_transfer_unmap( 914 struct pipe_context *ctx_, 915 struct pipe_transfer* transfer) 916{ 917 struct r600_context *ctx = NULL; 918 struct r600_resource_global* buffer = NULL; 919 920 assert(transfer->resource->target == PIPE_BUFFER); 921 assert(transfer->resource->bind & PIPE_BIND_GLOBAL); 922 923 ctx = (struct r600_context *)ctx_; 924 buffer = (struct r600_resource_global*)transfer->resource; 925 926 COMPUTE_DBG(ctx->screen, "* r600_compute_global_transfer_unmap()\n"); 927 928 ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf); 929 util_slab_free(&ctx->pool_transfers, transfer); 930} 931 932void r600_compute_global_transfer_flush_region( 933 struct pipe_context *ctx_, 934 struct pipe_transfer *transfer, 935 const struct pipe_box *box) 936{ 937 assert(0 && "TODO"); 938} 939 940void r600_compute_global_transfer_inline_write( 941 struct pipe_context *pipe, 942 struct pipe_resource *resource, 943 unsigned level, 944 unsigned usage, 945 const struct pipe_box *box, 946 const void *data, 947 unsigned stride, 948 unsigned layer_stride) 949{ 950 assert(0 && "TODO"); 951} 952