radeon_drm_cs.c revision 12aeb47b6af4b3100da26b3ab72ef93886479219
/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 *
 * Based on work from libdrm_radeon by:
 *      Aapo Tahkola <aet@rasterburn.org>
 *      Nicolai Haehnle <prefect_@gmx.net>
 *      Jérôme Glisse <glisse@freedesktop.org>
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Reloc writes and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_reloc(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply or'd for the accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 number allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints a nasty message to stderr.
    (This is done in the pipe driver.)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_reloc. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_reloc.
*/
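
/*
    A rough usage sketch of how a pipe driver is expected to drive these hooks.
    The names driver_flush_cb, ctx and buf are placeholders; the function
    pointers, usage/domain enums and flush flags are the ones wired up in
    radeon_drm_cs_init_functions() at the end of this file.

        struct radeon_winsys_cs *cs = rws->cs_create(rws);
        rws->cs_set_flush_callback(cs, driver_flush_cb, ctx);

        rws->cs_add_reloc(cs, buf, RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM);
        if (!rws->cs_validate(cs)) {
            // The winsys has already triggered a flush through the callback
            // (or cleaned up the CS if it was empty); re-add the relocs for
            // this one operation and validate again before dropping it.
        }
        // ... emit command dwords into cs->buf / cs->cdw ...
        rws->cs_write_reloc(cs, buf);   // emits 0xc0001000 plus the reloc's dword offset
        rws->cs_flush(cs, RADEON_FLUSH_ASYNC);
*/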

#include "radeon_drm_cs.h"

#include "util/u_memory.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>

/*
 * These defines are copied from radeon_drm.h. Once an updated libdrm is
 * released, we should bump the configure.ac requirement for it and remove
 * the following fallbacks.
 */
#ifndef RADEON_CHUNK_ID_FLAGS
#define RADEON_CHUNK_ID_FLAGS       0x03

/* The first dword of RADEON_CHUNK_ID_FLAGS is a uint32 of these flags: */
#define RADEON_CS_KEEP_TILING_FLAGS 0x01
#endif

#ifndef RADEON_CS_USE_VM
#define RADEON_CS_USE_VM            0x02
/* The second dword of RADEON_CHUNK_ID_FLAGS is a uint32 that sets the ring type */
#define RADEON_CS_RING_GFX          0
#define RADEON_CS_RING_COMPUTE      1
#endif

#ifndef RADEON_CS_END_OF_FRAME
#define RADEON_CS_END_OF_FRAME      0x04
#endif


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
                                      struct radeon_drm_winsys *ws)
{
    csc->fd = ws->fd;
    csc->nrelocs = 512;
    csc->relocs_bo = (struct radeon_bo**)
                     CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
    if (!csc->relocs_bo) {
        return FALSE;
    }

    csc->relocs = (struct drm_radeon_cs_reloc*)
                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
    if (!csc->relocs) {
        FREE(csc->relocs_bo);
        return FALSE;
    }

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
    return TRUE;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->crelocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i], NULL);
    }

    csc->crelocs = 0;
    csc->validated_crelocs = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;
    csc->used_gart = 0;
    csc->used_vram = 0;
    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}

DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param);
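
/* Each CS object owns two contexts: cs->csc is the one the driver currently
 * records into, cs->cst is the one being submitted. radeon_drm_cs_flush()
 * flips them, so recording of the next CS can overlap the previous ioctl,
 * which runs in the worker thread created below when more than one CPU is
 * available and RADEON_THREAD is not disabled. */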

static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
{
    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    pipe_semaphore_init(&cs->flush_queued, 0);
    pipe_semaphore_init(&cs->flush_completed, 0);

    cs->ws = ws;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.buf = cs->csc->buf;

    p_atomic_inc(&ws->num_cs);
    if (cs->ws->num_cpus > 1 && debug_get_option_thread())
        cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs);
    return &cs->base;
}

#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)

static INLINE void update_reloc_domains(struct drm_radeon_cs_reloc *reloc,
                                        enum radeon_bo_domain rd,
                                        enum radeon_bo_domain wd,
                                        enum radeon_bo_domain *added_domains)
{
    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);

    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
}
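
/* Reloc lookup: the BO handle is masked into the is_handle_added /
 * reloc_indices_hashlist tables (the mask acts as a cheap modulo, assuming
 * the table size is a power of two). On a collision, the reloc array is
 * scanned backwards and the table entry is refreshed with the match, so
 * repeated lookups of the same buffer stay O(1). */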

int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);

    if (csc->is_handle_added[hash]) {
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle == bo->handle) {
            return i;
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                /* Put this reloc in the hash list.
                 * This will prevent additional hash collisions if there are
                 * several consecutive get_reloc calls for the same buffer.
                 *
                 * Example: Assuming buffers A,B,C collide in the hash list,
                 * the following sequence of relocs:
                 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
                 * will collide here: ^ and here:   ^,
                 * meaning that we should get very few collisions in the end. */
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    return -1;
}

static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
                                 struct radeon_bo *bo,
                                 enum radeon_bo_usage usage,
                                 enum radeon_bo_domain domains,
                                 enum radeon_bo_domain *added_domains)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;

    if (csc->is_handle_added[hash]) {
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle == bo->handle) {
            update_reloc_domains(reloc, rd, wd, added_domains);
            return i;
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                update_reloc_domains(reloc, rd, wd, added_domains);

                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = 0;

    csc->is_handle_added[hash] = TRUE;
    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}

static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
                                        struct radeon_winsys_cs_handle *buf,
                                        enum radeon_bo_usage usage,
                                        enum radeon_bo_domain domains)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    unsigned index = radeon_add_reloc(cs->csc, bo, usage, domains, &added_domains);

    if (added_domains & RADEON_DOMAIN_GTT)
        cs->csc->used_gart += bo->base.size;
    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->csc->used_vram += bo->base.size;

    return index;
}
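
/* Validation succeeds while the accumulated GTT and VRAM usage stays below
 * 80% of the reported sizes, leaving headroom for fragmentation. On failure,
 * relocs added since the last successful validation are dropped and the CS
 * is flushed asynchronously (if it still holds validated relocs), so the
 * driver can retry the rejected operation in a fresh CS. */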

static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove lately-added relocations. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated relocations. */
        unsigned i;

        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
        } else {
            radeon_cs_context_cleanup(cs->csc);

            assert(cs->base.cdw == 0);
            if (cs->base.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
                                      struct radeon_winsys_cs_handle *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;

    unsigned index = radeon_get_reloc(cs->csc, bo);

    if (index == -1) {
        fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
        return;
    }

    OUT_CS(&cs->base, 0xc0001000);
    OUT_CS(&cs->base, index * RELOC_DWORDS);
}

static void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc)
{
    unsigned i;

    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs))) {
        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                            "see dmesg for more information.\n");
        }
    }

    for (i = 0; i < csc->crelocs; i++)
        p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
{
    struct radeon_drm_cs *cs = (struct radeon_drm_cs*)param;

    while (1) {
        pipe_semaphore_wait(&cs->flush_queued);
        if (cs->kill_thread)
            break;
        radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        pipe_semaphore_signal(&cs->flush_completed);
    }
    pipe_semaphore_signal(&cs->flush_completed);
    return NULL;
}

void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
{
    /* Wait for any pending ioctl to complete. */
    if (cs->thread && cs->flush_started) {
        pipe_semaphore_wait(&cs->flush_completed);
        cs->flush_started = 0;
    }
}

DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)
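
/* Flushing: wait for any previous submission, flip csc/cst, fill in the IB
 * chunk size and the two flags dwords (tiling, virtual memory, end-of-frame,
 * ring selection), then either hand the context to the worker thread
 * (RADEON_FLUSH_ASYNC) or submit the ioctl directly. num_chunks is raised
 * from 2 to 3 only when the flags chunk actually carries information. */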

static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    radeon_drm_cs_sync_flush(cs);

    /* Flip command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty or overflowed, emit it in a separate thread. */
    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS &&
        !debug_get_option_noop()) {
        unsigned i, crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
        }

        cs->cst->flags[0] = 0;
        cs->cst->flags[1] = RADEON_CS_RING_GFX;
        cs->cst->cs.num_chunks = 2;
        if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
            cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->cs.num_chunks = 3;
        }
        if (cs->ws->info.r600_virtual_address) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
        }
        if (flags & RADEON_FLUSH_END_OF_FRAME) {
            cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
            cs->cst->cs.num_chunks = 3;
        }
        if (flags & RADEON_FLUSH_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
        }

        if (cs->thread &&
            (flags & RADEON_FLUSH_ASYNC)) {
            cs->flush_started = 1;
            pipe_semaphore_signal(&cs->flush_queued);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    radeon_drm_cs_sync_flush(cs);
    if (cs->thread) {
        cs->kill_thread = 1;
        pipe_semaphore_signal(&cs->flush_queued);
        pipe_semaphore_wait(&cs->flush_completed);
        pipe_thread_wait(cs->thread);
    }
    pipe_semaphore_destroy(&cs->flush_queued);
    pipe_semaphore_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    FREE(cs);
}

static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
                                    void (*flush)(void *ctx, unsigned flags),
                                    void *user)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    cs->flush_cs = flush;
    cs->flush_data = user;
}

static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                       struct radeon_winsys_cs_handle *_buf,
                                       enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return FALSE;

    index = radeon_get_reloc(cs->csc, bo);
    if (index == -1)
        return FALSE;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return TRUE;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return TRUE;

    return FALSE;
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_set_flush_callback = radeon_drm_cs_set_flush;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
}