r600_hw_context.c revision 708337e62e86cfb2df893f0733bb7c5a4938fab6
/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pm4.h"
#include "radeonsi_pipe.h"
#include "sid.h"
#include "util/u_memory.h"
#include <errno.h>

#define GROUP_FORCE_NEW_BLOCK	0

/* Get backends mask */
void r600_get_backend_mask(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct si_resource *buffer;
	uint32_t *results;
	unsigned num_backends = ctx->screen->info.r600_num_backends;
	unsigned i, mask = 0;

	/* if backend_map query is supported by the kernel */
	if (ctx->screen->info.r600_backend_map_valid) {
		unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
		unsigned backend_map = ctx->screen->info.r600_backend_map;
		/* All chips handled by this driver are CAYMAN or newer,
		 * which use 4-bit entries in the backend map. */
		unsigned item_width = 4, item_mask = 0x7;

		while (num_tile_pipes--) {
			i = backend_map & item_mask;
			mask |= (1 << i);
			backend_map >>= item_width;
		}
		if (mask != 0) {
			ctx->backend_mask = mask;
			return;
		}
	}

	/* otherwise fallback path for older kernels */

	/* create buffer for event data */
	buffer = si_resource_create_custom(&ctx->screen->screen,
					   PIPE_USAGE_STAGING,
					   ctx->max_db * 16);
	if (!buffer)
		goto err;

	/* initialize buffer with zeroes */
	results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
	if (results) {
		uint64_t va = 0;

		memset(results, 0, ctx->max_db * 4 * 4);
		ctx->ws->buffer_unmap(buffer->cs_buf);

		/* emit EVENT_WRITE for ZPASS_DONE */
		va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = va >> 32;

		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
		cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

		/* analyze results */
		results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
		if (results) {
			for (i = 0; i < ctx->max_db; i++) {
				/* at least highest bit will be set if backend is used */
				if (results[i*4 + 1])
					mask |= (1 << i);
			}
			ctx->ws->buffer_unmap(buffer->cs_buf);
		}
	}

	si_resource_reference(&buffer, NULL);

	if (mask != 0) {
		ctx->backend_mask = mask;
		return;
	}

err:
	/* fallback to old method - set num_backends lower bits to 1 */
	ctx->backend_mask = (~((uint32_t)0)) >> (32 - num_backends);
	return;
}

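/* Emit a PS_PARTIAL_FLUSH event, which makes the CP wait until all
 * in-flight pixel shader work has finished.  Skipped when nothing has
 * been drawn since the flag was last cleared. */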
static inline void r600_context_ps_partial_flush(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;

	if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING))
		return;

	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

	ctx->flags &= ~R600_CONTEXT_DRAW_PENDING;
}

/* Make sure the CS has room for num_dw more dwords, flushing it if it doesn't. */
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
			boolean count_draw_in)
{
	struct r600_atom *state;

	/* The number of dwords we've already used in the CS so far. */
	num_dw += ctx->cs->cdw;

	if (count_draw_in) {
		/* The number of dwords all the dirty states would take. */
		LIST_FOR_EACH_ENTRY(state, &ctx->dirty_states, head) {
			num_dw += state->num_dw;
		}

		num_dw += ctx->pm4_dirty_cdwords;

		/* The upper-bound of how much a draw command would take. */
		num_dw += SI_MAX_DRAW_CS_DWORDS;
	}

	/* Count in queries_suspend. */
	num_dw += ctx->num_cs_dw_queries_suspend;

	/* Count in streamout_end at the end of CS. */
	num_dw += ctx->num_cs_dw_streamout_end;

	/* Count in render_condition(NULL) at the end of CS. */
	if (ctx->predicate_drawing) {
		num_dw += 3;
	}

	/* Count in framebuffer cache flushes at the end of CS. */
	num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

	/* Save 16 dwords for the fence mechanism. */
	num_dw += 16;

	/* Flush if there's not enough space. */
	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
		radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
	}
}

static void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now)
{
	if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
		return;

	ctx->atom_surface_sync.flush_flags |=
		r600_get_cb_flush_flags(ctx) |
		(ctx->framebuffer.zsbuf ? S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) : 0);

	if (flush_now) {
		r600_emit_atom(ctx, &ctx->atom_surface_sync.atom);
	} else {
		r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
	}

	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}

void r600_context_flush(struct r600_context *ctx, unsigned flags)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_block *enable_block = NULL;
	bool queries_suspended = false;
	bool streamout_suspended = false;

	if (!cs->cdw)
		return;

	/* suspend queries */
	if (ctx->num_cs_dw_queries_suspend) {
		r600_context_queries_suspend(ctx);
		queries_suspended = true;
	}

	if (ctx->num_cs_dw_streamout_end) {
		r600_context_streamout_end(ctx);
		streamout_suspended = true;
	}

	r600_flush_framebuffer(ctx, true);

	/* partial flush is needed to avoid lockups on some chips with user fences */
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

	/* force to keep tiling flags */
	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;

	/* Flush the CS. */
	ctx->ws->cs_flush(ctx->cs, flags);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

	if (streamout_suspended) {
		ctx->streamout_start = TRUE;
		ctx->streamout_append_bitmask = ~0;
	}

	/* resume queries */
	if (queries_suspended) {
		r600_context_queries_resume(ctx);
	}

	/* Set all valid groups as dirty so they get re-emitted on
	 * the next draw command.
	 */
	si_pm4_reset_emitted(ctx);
}

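/* Write 'value' into 'fence_bo' at dword 'offset' once the GPU has
 * processed everything preceding this packet.  EVENT_WRITE_EOP performs
 * the write from the end of the pipe after a cache flush-and-invalidate,
 * so the CPU can poll the dword to wait for the GPU. */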
void r600_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	r600_need_cs_space(ctx, 10, FALSE);

	va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
	va = va + (offset << 2);

	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* ADDRESS_LO */
	/* DATA_SEL | INT_EN | ADDRESS_HI */
	cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
	cs->buf[cs->cdw++] = value; /* DATA_LO */
	cs->buf[cs->cdw++] = 0; /* DATA_HI */
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}

static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
				       bool test_status_bit)
{
	uint32_t *current_result = (uint32_t*)map;
	uint64_t start, end;

	start = (uint64_t)current_result[start_index] |
		(uint64_t)current_result[start_index+1] << 32;
	end = (uint64_t)current_result[end_index] |
	      (uint64_t)current_result[end_index+1] << 32;

	if (!test_status_bit ||
	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
		return end - start;
	}
	return 0;
}

static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
	unsigned results_base = query->results_start;
	char *map;

	map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
				  PIPE_TRANSFER_READ |
				  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
	if (!map)
		return FALSE;

	/* count all results across all data blocks */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, true);
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 0, 2, true) != 0;
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, false);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		/* SAMPLE_STREAMOUTSTATS stores this structure:
		 * {
		 *    u64 NumPrimitivesWritten;
		 *    u64 PrimitiveStorageNeeded;
		 * }
		 * We only need NumPrimitivesWritten here. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 2, 6, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		/* Here we read PrimitiveStorageNeeded. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_STATISTICS:
		while (results_base != query->results_end) {
			query->result.so.num_primitives_written +=
				r600_query_read_result(map + results_base, 2, 6, true);
			query->result.so.primitives_storage_needed +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 2, 6, true) !=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	default:
		assert(0);
	}

	query->results_start = query->results_end;
	ctx->ws->buffer_unmap(query->buffer->cs_buf);
	return TRUE;
}

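/* Query results accumulate in a ring buffer: every begin/end pair
 * appends one block of query->result_size bytes, wrapping at the end of
 * query->buffer.  The pending blocks are folded back into query->result
 * when the ring is about to overflow. */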
void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	unsigned new_results_end, i;
	uint32_t *results;
	uint64_t va;

	r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

	new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;

	/* collect current results if query buffer is full */
	if (new_results_end == query->results_start) {
		r600_query_result(ctx, query, TRUE);
	}

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);

			/* Set top bits for unused backends */
			for (i = 0; i < ctx->max_db; i++) {
				if (!(ctx->backend_mask & (1<<i))) {
					results[(i * 4)+1] = 0x80000000;
					results[(i * 4)+3] = 0x80000000;
				}
			}
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	default:
		assert(0);
	}

	/* emit begin query */
	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	va += query->results_end;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}

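/* Emit the "end" half of a query.  The end counters land in the second
 * half of the current result block (each occlusion slot stores a
 * begin/end pair, so its end values start 8 bytes in; the other query
 * types use result_size/2), then the write offset advances to the next
 * block in the ring. */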
void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	/* emit end query */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		va += query->results_end + 8;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		va += query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
	ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}

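/* Program conditional rendering from accumulated query results.  Since
 * the results may be spread over several blocks of the ring buffer, one
 * SET_PREDICATION packet is emitted per block; all but the first carry
 * the CONTINUE bit so the hardware combines them into one predicate. */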
void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
			    int flag_wait)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	if (operation == PREDICATION_OP_CLEAR) {
		r600_need_cs_space(ctx, 3, FALSE);

		cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
	} else {
		unsigned results_base = query->results_start;
		unsigned count;
		uint32_t op;

		/* find count of the query data blocks */
		count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
		count /= query->result_size;

		r600_need_cs_space(ctx, 5 * count, TRUE);

		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
		     (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
		va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

		/* emit predicate packets for all data blocks */
		while (results_base != query->results_end) {
			cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
			cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
			cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
								   RADEON_USAGE_READ);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;

			/* set CONTINUE bit for all packets except the first */
			op |= PREDICATION_CONTINUE;
		}
	}
}

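/* Allocate a query object and its result buffer.  The buffer size is
 * rounded down to a whole number of result blocks so the wrap-around
 * arithmetic in begin/end never splits a block across the boundary. */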
struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
	struct r600_query *query;
	unsigned buffer_size = 4096;

	query = CALLOC_STRUCT(r600_query);
	if (query == NULL)
		return NULL;

	query->type = query_type;

	switch (query_type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		query->result_size = 16 * ctx->max_db;
		query->num_cs_dw = 6;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		query->result_size = 16;
		query->num_cs_dw = 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
		query->result_size = 32;
		query->num_cs_dw = 6;
		break;
	default:
		assert(0);
		FREE(query);
		return NULL;
	}

	/* adjust buffer size to simplify offsets wrapping math */
	buffer_size -= buffer_size % query->result_size;

	/* Queries are normally read by the CPU after
	 * being written by the GPU, hence staging is probably a good
	 * usage pattern.
	 */
	query->buffer = si_resource_create_custom(&ctx->screen->screen,
						  PIPE_USAGE_STAGING,
						  buffer_size);
	if (!query->buffer) {
		FREE(query);
		return NULL;
	}
	return query;
}

void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
	si_resource_reference(&query->buffer, NULL);
	FREE(query);
}

boolean r600_context_query_result(struct r600_context *ctx,
				  struct r600_query *query,
				  boolean wait, void *vresult)
{
	boolean *result_b = (boolean*)vresult;
	uint64_t *result_u64 = (uint64_t*)vresult;
	struct pipe_query_data_so_statistics *result_so =
		(struct pipe_query_data_so_statistics*)vresult;

	if (!r600_query_result(ctx, query, wait))
		return FALSE;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		*result_u64 = query->result.u64;
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		*result_b = query->result.b;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		*result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		*result_so = query->result.so;
		break;
	default:
		assert(0);
	}
	return TRUE;
}

void r600_context_queries_suspend(struct r600_context *ctx)
{
	struct r600_query *query;

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_end(ctx, query);
	}
	assert(ctx->num_cs_dw_queries_suspend == 0);
}

void r600_context_queries_resume(struct r600_context *ctx)
{
	struct r600_query *query;

	assert(ctx->num_cs_dw_queries_suspend == 0);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_begin(ctx, query);
	}
}

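/* Streamout.  Most of the register and STRMOUT_BUFFER_UPDATE emission
 * below is still compiled out with #if 0, presumably while the SI
 * streamout path is brought up; only the VGT flush and the
 * enable/disable calls are currently live. */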
void r600_context_streamout_begin(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_so_target **t = ctx->so_targets;
	unsigned *strides = ctx->vs_shader_so_strides;
	unsigned buffer_en, i;

	buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
		    (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
		    (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
		    (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);

	ctx->num_cs_dw_streamout_end =
		12 + /* flush_vgt_streamout */
		util_bitcount(buffer_en) * 8 +
		3;

	r600_need_cs_space(ctx,
			   12 + /* flush_vgt_streamout */
			   6 + /* enables */
			   util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 +
			   util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 +
			   ctx->num_cs_dw_streamout_end, TRUE);

	if (ctx->chip_class >= CAYMAN) {
		evergreen_flush_vgt_streamout(ctx);
		evergreen_set_streamout_enable(ctx, buffer_en);
	}

	for (i = 0; i < ctx->num_so_targets; i++) {
#if 0
		if (t[i]) {
			t[i]->stride = strides[i];
			t[i]->so_index = i;

			cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0);
			cs->buf[cs->cdw++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 +
					      16*i - SI_CONTEXT_REG_OFFSET) >> 2;
			cs->buf[cs->cdw++] = (t[i]->b.buffer_offset +
					      t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
			cs->buf[cs->cdw++] = strides[i] >> 2; /* VTX_STRIDE (in DW) */
			cs->buf[cs->cdw++] = 0; /* BUFFER_BASE */

			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] =
				r600_context_bo_reloc(ctx, si_resource(t[i]->b.buffer),
						      RADEON_USAGE_WRITE);

			if (ctx->streamout_append_bitmask & (1 << i)) {
				/* Append. */
				cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
				cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
						     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* src address lo */
				cs->buf[cs->cdw++] = 0; /* src address hi */

				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
				cs->buf[cs->cdw++] =
					r600_context_bo_reloc(ctx, t[i]->filled_size,
							      RADEON_USAGE_READ);
			} else {
				/* Start from the beginning. */
				cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
				cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
						     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
				cs->buf[cs->cdw++] = 0; /* unused */
			}
		}
#endif
	}
}

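/* Stop streamout and (once the #if 0 code is enabled) store each
 * target's buffer-filled size back to memory, where a later
 * r600_context_draw_opaque_count can read it. */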
void r600_context_streamout_end(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_so_target **t = ctx->so_targets;
	unsigned i, flush_flags = 0;

	evergreen_flush_vgt_streamout(ctx);

	for (i = 0; i < ctx->num_so_targets; i++) {
#if 0
		if (t[i]) {
			cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
			cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
					     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
					     STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
			cs->buf[cs->cdw++] = 0; /* dst address lo */
			cs->buf[cs->cdw++] = 0; /* dst address hi */
			cs->buf[cs->cdw++] = 0; /* unused */
			cs->buf[cs->cdw++] = 0; /* unused */

			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] =
				r600_context_bo_reloc(ctx, t[i]->filled_size,
						      RADEON_USAGE_WRITE);

			flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i;
		}
#endif
	}

	evergreen_set_streamout_enable(ctx, 0);

	ctx->atom_surface_sync.flush_flags |= flush_flags;
	r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);

	ctx->num_cs_dw_streamout_end = 0;

	/* XXX print some debug info */
	for (i = 0; i < ctx->num_so_targets; i++) {
		if (!t[i])
			continue;

		uint32_t *ptr = ctx->ws->buffer_map(t[i]->filled_size->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
		printf("FILLED_SIZE%i: %u\n", i, *ptr);
		ctx->ws->buffer_unmap(t[i]->filled_size->cs_buf);
	}
}

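/* Set up a "draw opaque" draw, where the vertex count comes from the
 * number of bytes previously written by streamout (filled size divided
 * by VERTEX_STRIDE) rather than being supplied by the CPU. */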
void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	r600_need_cs_space(ctx, 14 + 21, TRUE);

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = 0;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->stride >> 2;

#if 0
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = 0; /* src address lo */
	cs->buf[cs->cdw++] = 0; /* src address hi */
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */
#endif

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);

#if 0 /* I have not found this useful yet. */
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
	cs->buf[cs->cdw++] = 0; /* unused */
	cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct si_resource*)t->b.buffer,
						   RADEON_USAGE_WRITE);

	cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
	cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
	cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2; /* register */
	cs->buf[cs->cdw++] = 0;
	cs->buf[cs->cdw++] = 0; /* reference value */
	cs->buf[cs->cdw++] = 0xffffffff; /* mask */
	cs->buf[cs->cdw++] = 4; /* poll interval */
#endif
}