r600_hw_context.c revision 696b6cf46609281711add5331b9c3e1d0240ecbc
/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pm4.h"
#include "radeonsi_pipe.h"
#include "sid.h"
#include "util/u_memory.h"
#include <errno.h>

#define GROUP_FORCE_NEW_BLOCK	0

/* Get backends mask */
void r600_get_backend_mask(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct si_resource *buffer;
	uint32_t *results;
	unsigned num_backends = ctx->screen->info.r600_num_backends;
	unsigned i, mask = 0;

	/* if backend_map query is supported by the kernel */
	if (ctx->screen->info.r600_backend_map_valid) {
		unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
		unsigned backend_map = ctx->screen->info.r600_backend_map;
		unsigned item_width, item_mask;

		/* All chips driven by radeonsi are >= CAYMAN, so this branch
		 * is always taken and item_width/item_mask are always set. */
		if (ctx->chip_class >= CAYMAN) {
			item_width = 4;
			item_mask = 0x7;
		}

		while (num_tile_pipes--) {
			i = backend_map & item_mask;
			mask |= (1 << i);
			backend_map >>= item_width;
		}
		if (mask != 0) {
			ctx->backend_mask = mask;
			return;
		}
	}

	/* otherwise use the backup path for older kernels */

	/* create buffer for event data */
	buffer = si_resource_create_custom(&ctx->screen->screen,
					   PIPE_USAGE_STAGING,
					   ctx->max_db * 16);
	if (!buffer)
		goto err;

	/* initialize buffer with zeroes */
	results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
	if (results) {
		uint64_t va = 0;

		memset(results, 0, ctx->max_db * 4 * 4);
		ctx->ws->buffer_unmap(buffer->cs_buf);

		/* emit EVENT_WRITE for ZPASS_DONE */
		va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = va >> 32;

		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
		cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

		/* analyze results */
		results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
		if (results) {
			for (i = 0; i < ctx->max_db; i++) {
				/* at least the highest bit will be set if the backend is used */
				if (results[i * 4 + 1])
					mask |= (1 << i);
			}
			ctx->ws->buffer_unmap(buffer->cs_buf);
		}
	}

	si_resource_reference(&buffer, NULL);

	if (mask != 0) {
		ctx->backend_mask = mask;
		return;
	}

err:
	/* fall back to the old method - set the num_backends lower bits to 1 */
	ctx->backend_mask = (~((uint32_t)0)) >> (32 - num_backends);
	return;
}
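
/*
 * A sketch of the buffer layout the fallback above relies on: each DB
 * (depth backend) owns a 16-byte slot, and a ZPASS_DONE event makes every
 * active DB write a 64-bit Z-pass count with bit 63 set into its slot.
 * Slots of absent backends stay zeroed by the memset, so a non-zero high
 * dword at results[i * 4 + 1] marks DB i as present.  With max_db = 8,
 * for example, the staging buffer is 8 * 16 = 128 bytes.
 */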

/* initialize */
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
			boolean count_draw_in)
{
	struct r600_atom *state;

	/* The number of dwords we've already used in the CS so far. */
	num_dw += ctx->cs->cdw;

	if (count_draw_in) {
		/* The number of dwords all the dirty states would take. */
		LIST_FOR_EACH_ENTRY(state, &ctx->dirty_states, head) {
			num_dw += state->num_dw;
		}

		num_dw += ctx->pm4_dirty_cdwords;

		/* The upper bound of how much a draw command would take. */
		num_dw += SI_MAX_DRAW_CS_DWORDS;
	}

	/* Count in queries_suspend. */
	num_dw += ctx->num_cs_dw_queries_suspend;

	/* Count in streamout_end at the end of CS. */
	num_dw += ctx->num_cs_dw_streamout_end;

	/* Count in render_condition(NULL) at the end of CS. */
	if (ctx->predicate_drawing) {
		num_dw += 3;
	}

	/* Count in framebuffer cache flushes at the end of CS. */
	num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

	/* Save 16 dwords for the fence mechanism. */
	num_dw += 16;

	/* Flush if there's not enough space. */
	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
		radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
	}
}

static void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now)
{
	if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
		return;

	ctx->atom_surface_sync.flush_flags |=
		r600_get_cb_flush_flags(ctx) |
		(ctx->framebuffer.zsbuf ? S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) : 0);

	if (flush_now) {
		r600_emit_atom(ctx, &ctx->atom_surface_sync.atom);
	} else {
		r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
	}

	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}
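
/*
 * Worked example of the dword budget in r600_need_cs_space (numbers are
 * illustrative): with cs->cdw = 100 dwords already emitted, 40 dwords of
 * dirty state, SI_MAX_DRAW_CS_DWORDS for the draw itself, plus the fixed
 * reserves (suspended queries, streamout end, 3 for render_condition(NULL),
 * 7 for the framebuffer flush, 16 for the fence), the total must stay under
 * RADEON_MAX_CMDBUF_DWORDS, or the CS is flushed asynchronously before the
 * caller writes its packets.
 */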

void r600_context_flush(struct r600_context *ctx, unsigned flags)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_block *enable_block = NULL;
	bool queries_suspended = false;
	bool streamout_suspended = false;

	if (!cs->cdw)
		return;

	/* suspend queries */
	if (ctx->num_cs_dw_queries_suspend) {
		r600_context_queries_suspend(ctx);
		queries_suspended = true;
	}

	if (ctx->num_cs_dw_streamout_end) {
		r600_context_streamout_end(ctx);
		streamout_suspended = true;
	}

	r600_flush_framebuffer(ctx, true);

	/* partial flush is needed to avoid lockups on some chips with user fences */
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

	/* force to keep tiling flags */
	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;

	/* Flush the CS. */
	ctx->ws->cs_flush(ctx->cs, flags);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

	if (streamout_suspended) {
		ctx->streamout_start = TRUE;
		ctx->streamout_append_bitmask = ~0;
	}

	/* resume queries */
	if (queries_suspended) {
		r600_context_queries_resume(ctx);
	}

	/* set all valid groups as dirty so they get re-emitted on
	 * the next draw command
	 */
	si_pm4_reset_emitted(ctx);
}

void r600_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	r600_need_cs_space(ctx, 10, FALSE);

	va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
	va = va + (offset << 2);

	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* ADDRESS_LO */
	/* DATA_SEL | INT_EN | ADDRESS_HI */
	cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
	cs->buf[cs->cdw++] = value; /* DATA_LO */
	cs->buf[cs->cdw++] = 0; /* DATA_HI */
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}

static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
				       bool test_status_bit)
{
	uint32_t *current_result = (uint32_t*)map;
	uint64_t start, end;

	start = (uint64_t)current_result[start_index] |
		(uint64_t)current_result[start_index+1] << 32;
	end = (uint64_t)current_result[end_index] |
	      (uint64_t)current_result[end_index+1] << 32;

	if (!test_status_bit ||
	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
		return end - start;
	}
	return 0;
}
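
/*
 * Snapshot convention assumed by r600_query_read_result above: the GPU
 * writes a 64-bit "begin" value at start_index and a 64-bit "end" value at
 * end_index, setting bit 63 of each once the write has actually landed.
 * When test_status_bit is true, (end - start) is counted only if both bits
 * are set; otherwise a completed snapshot could be mixed with an unwritten
 * one and produce garbage.
 */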

static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
	unsigned results_base = query->results_start;
	char *map;

	map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
				  PIPE_TRANSFER_READ |
				  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
	if (!map)
		return FALSE;

	/* count all results across all data blocks */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, true);
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 0, 2, true) != 0;
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, false);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		/* SAMPLE_STREAMOUTSTATS stores this structure:
		 * {
		 *    u64 NumPrimitivesWritten;
		 *    u64 PrimitiveStorageNeeded;
		 * }
		 * We only need NumPrimitivesWritten here. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 2, 6, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		/* Here we read PrimitiveStorageNeeded. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_STATISTICS:
		while (results_base != query->results_end) {
			query->result.so.num_primitives_written +=
				r600_query_read_result(map + results_base, 2, 6, true);
			query->result.so.primitives_storage_needed +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 2, 6, true) !=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	default:
		assert(0);
	}

	query->results_start = query->results_end;
	ctx->ws->buffer_unmap(query->buffer->cs_buf);
	return TRUE;
}
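
/*
 * The results buffer is used as a ring: results_start/results_end are byte
 * offsets into query->buffer, advanced modulo width0.  Because the buffer
 * size is trimmed at create time to a multiple of result_size, the cursor
 * always lands on block boundaries.  E.g. (illustrative numbers) with
 * width0 = 4096 and result_size = 32, a wrapped query steps
 * 4064 -> 0 -> 32, and the loops above stop when the read cursor catches
 * up with results_end.
 */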

void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	unsigned new_results_end, i;
	uint32_t *results;
	uint64_t va;

	r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

	new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;

	/* collect current results if query buffer is full */
	if (new_results_end == query->results_start) {
		r600_query_result(ctx, query, TRUE);
	}

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);

			/* Set top bits for unused backends */
			for (i = 0; i < ctx->max_db; i++) {
				if (!(ctx->backend_mask & (1<<i))) {
					results[(i * 4)+1] = 0x80000000;
					results[(i * 4)+3] = 0x80000000;
				}
			}
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	default:
		assert(0);
	}

	/* emit begin query */
	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	va += query->results_end;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}
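
/*
 * Note on num_cs_dw_queries_suspend: begin adds query->num_cs_dw and end
 * subtracts it again, so while a query is active the counter holds exactly
 * the space an r600_query_end() would need.  r600_need_cs_space() reserves
 * that amount, which is why a CS flush in the middle of an active query can
 * always suspend it safely.
 */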

void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	/* emit end query */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		va += query->results_end + 8;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		va += query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
	ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}

void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
			    int flag_wait)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	if (operation == PREDICATION_OP_CLEAR) {
		r600_need_cs_space(ctx, 3, FALSE);

		cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
	} else {
		unsigned results_base = query->results_start;
		unsigned count;
		uint32_t op;

		/* find count of the query data blocks */
		count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
		count /= query->result_size;

		r600_need_cs_space(ctx, 5 * count, TRUE);

		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
		     (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
		va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

		/* emit predicate packets for all data blocks */
		while (results_base != query->results_end) {
			cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
			cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
			cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
								   RADEON_USAGE_READ);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;

			/* set CONTINUE bit for all packets except the first */
			op |= PREDICATION_CONTINUE;
		}
	}
}

struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
	struct r600_query *query;
	unsigned buffer_size = 4096;

	query = CALLOC_STRUCT(r600_query);
	if (query == NULL)
		return NULL;

	query->type = query_type;

	switch (query_type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		query->result_size = 16 * ctx->max_db;
		query->num_cs_dw = 6;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		query->result_size = 16;
		query->num_cs_dw = 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
		query->result_size = 32;
		query->num_cs_dw = 6;
		break;
	default:
		assert(0);
		FREE(query);
		return NULL;
	}

	/* adjust buffer size to simplify offsets wrapping math */
	buffer_size -= buffer_size % query->result_size;

	/* Queries are normally read by the CPU after
	 * being written by the GPU, hence staging is probably a good
	 * usage pattern.
	 */
	query->buffer = si_resource_create_custom(&ctx->screen->screen,
						  PIPE_USAGE_STAGING,
						  buffer_size);
	if (!query->buffer) {
		FREE(query);
		return NULL;
	}
	return query;
}
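
/*
 * Illustrative example of the wrap adjustment above: if, hypothetically,
 * result_size were 192 bytes (16 * 12), then 4096 % 192 = 64 and
 * buffer_size would be trimmed to 4032, an exact multiple.  That keeps the
 * "(offset + result_size) % width0" arithmetic used throughout this file
 * landing on block boundaries.
 */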

void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
	si_resource_reference(&query->buffer, NULL);
	FREE(query);
}

boolean r600_context_query_result(struct r600_context *ctx,
				  struct r600_query *query,
				  boolean wait, void *vresult)
{
	boolean *result_b = (boolean*)vresult;
	uint64_t *result_u64 = (uint64_t*)vresult;
	struct pipe_query_data_so_statistics *result_so =
		(struct pipe_query_data_so_statistics*)vresult;

	if (!r600_query_result(ctx, query, wait))
		return FALSE;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		*result_u64 = query->result.u64;
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		*result_b = query->result.b;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		*result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		*result_so = query->result.so;
		break;
	default:
		assert(0);
	}
	return TRUE;
}

void r600_context_queries_suspend(struct r600_context *ctx)
{
	struct r600_query *query;

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_end(ctx, query);
	}
	assert(ctx->num_cs_dw_queries_suspend == 0);
}

void r600_context_queries_resume(struct r600_context *ctx)
{
	struct r600_query *query;

	assert(ctx->num_cs_dw_queries_suspend == 0);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_begin(ctx, query);
	}
}
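
/*
 * Unit check for the TIME_ELAPSED conversion in r600_context_query_result
 * above (assuming the kernel reports r600_clock_crystal_freq in kHz, as
 * other r600 code does): (1000000 * ticks) / freq_kHz yields nanoseconds,
 * which is what pipe expects.  E.g. a 27000 kHz crystal counting 54000
 * ticks gives 1000000 * 54000 / 27000 = 2000000 ns = 2 ms.
 */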

void r600_context_streamout_begin(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_so_target **t = ctx->so_targets;
	unsigned *strides = ctx->vs_shader_so_strides;
	unsigned buffer_en, i;

	buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
		    (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
		    (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
		    (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);

	ctx->num_cs_dw_streamout_end =
		12 + /* flush_vgt_streamout */
		util_bitcount(buffer_en) * 8 +
		3;

	r600_need_cs_space(ctx,
			   12 + /* flush_vgt_streamout */
			   6 + /* enables */
			   util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 +
			   util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 +
			   ctx->num_cs_dw_streamout_end, TRUE);

	if (ctx->chip_class >= CAYMAN) {
		evergreen_flush_vgt_streamout(ctx);
		evergreen_set_streamout_enable(ctx, buffer_en);
	}

	for (i = 0; i < ctx->num_so_targets; i++) {
#if 0
		if (t[i]) {
			t[i]->stride = strides[i];
			t[i]->so_index = i;

			cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0);
			cs->buf[cs->cdw++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 +
					      16*i - SI_CONTEXT_REG_OFFSET) >> 2;
			cs->buf[cs->cdw++] = (t[i]->b.buffer_offset +
					      t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
			cs->buf[cs->cdw++] = strides[i] >> 2; /* VTX_STRIDE (in DW) */
			cs->buf[cs->cdw++] = 0; /* BUFFER_BASE */

			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] =
				r600_context_bo_reloc(ctx, si_resource(t[i]->b.buffer),
						      RADEON_USAGE_WRITE);

			if (ctx->streamout_append_bitmask & (1 << i)) {
				/* Append. */
				cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
				cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
						     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* src address lo */
				cs->buf[cs->cdw++] = 0; /* src address hi */

				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
				cs->buf[cs->cdw++] =
					r600_context_bo_reloc(ctx, t[i]->filled_size,
							      RADEON_USAGE_READ);
			} else {
				/* Start from the beginning. */
				cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
				cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
						     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
				cs->buf[cs->cdw++] = 0; /* unused */
			}
		}
#endif
	}
}
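
/*
 * The per-buffer programming above is compiled out (#if 0), presumably
 * while streamout is being brought up on SI; only the CS-space accounting
 * and the enable bits are live.  When enabled, the append path
 * (streamout_append_bitmask) would reload each buffer's write offset from
 * its filled_size buffer (STRMOUT_OFFSET_FROM_MEM), while the non-append
 * path takes the offset directly from the packet
 * (STRMOUT_OFFSET_FROM_PACKET).
 */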

void r600_context_streamout_end(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_so_target **t = ctx->so_targets;
	unsigned i, flush_flags = 0;

	evergreen_flush_vgt_streamout(ctx);

	for (i = 0; i < ctx->num_so_targets; i++) {
#if 0
		if (t[i]) {
			cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
			cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
					     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
					     STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
			cs->buf[cs->cdw++] = 0; /* dst address lo */
			cs->buf[cs->cdw++] = 0; /* dst address hi */
			cs->buf[cs->cdw++] = 0; /* unused */
			cs->buf[cs->cdw++] = 0; /* unused */

			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] =
				r600_context_bo_reloc(ctx, t[i]->filled_size,
						      RADEON_USAGE_WRITE);

			flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i;
		}
#endif
	}

	evergreen_set_streamout_enable(ctx, 0);

	ctx->atom_surface_sync.flush_flags |= flush_flags;
	r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);

	ctx->num_cs_dw_streamout_end = 0;

	/* XXX print some debug info */
	for (i = 0; i < ctx->num_so_targets; i++) {
		if (!t[i])
			continue;

		/* buffer_map takes PIPE_TRANSFER_* flags like every other
		 * call in this file (was RADEON_USAGE_READ). */
		uint32_t *ptr = ctx->ws->buffer_map(t[i]->filled_size->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
		printf("FILLED_SIZE%i: %u\n", i, *ptr);
		ctx->ws->buffer_unmap(t[i]->filled_size->cs_buf);
	}
}
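
/*
 * "Draw opaque" (DrawTransformFeedback-style) support: the vertex count is
 * not known on the CPU, so the GPU derives it from the number of bytes
 * streamed out (VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE) and the vertex
 * stride programmed in the function below.
 */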

void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	r600_need_cs_space(ctx, 14 + 21, TRUE);

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = 0;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->stride >> 2;

#if 0
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = 0; /* src address lo */
	cs->buf[cs->cdw++] = 0; /* src address hi */
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */
#endif

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);

#if 0 /* I have not found this useful yet. */
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
	cs->buf[cs->cdw++] = 0; /* unused */
	cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct si_resource*)t->b.buffer,
						   RADEON_USAGE_WRITE);

	cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
	cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
	cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2; /* register */
	cs->buf[cs->cdw++] = 0;
	cs->buf[cs->cdw++] = 0; /* reference value */
	cs->buf[cs->cdw++] = 0xffffffff; /* mask */
	cs->buf[cs->cdw++] = 4; /* poll interval */
#endif
}