radeon_drm_cs.c revision a7f4d3b740d4c85b0dc2b006c30c2bc4a3ed8597
/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 *
 * Based on work from libdrm_radeon by:
 *      Aapo Tahkola <aet@rasterburn.org>
 *      Nicolai Haehnle <prefect_@gmx.net>
 *      Jérôme Glisse <glisse@freedesktop.org>
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Reloc writes and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_reloc(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply or'd for the accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 number allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_reloc. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_reloc.
*/
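
/*
    A rough sketch of how a pipe driver is expected to drive this winsys
    (illustrative pseudocode only; the cs_* entry points, usage flags and
    domain flags are the real ones hooked up in radeon_drm_cs_init_functions
    at the bottom of this file, everything else is hypothetical):

        ws->cs_add_reloc(cs, buf, RADEON_USAGE_READ, RADEON_DOMAIN_VRAM);
        if (!ws->cs_validate(cs)) {
            // Validation failed: the winsys dropped the relocs added since
            // the last successful validation and may have flushed the CS.
            // The driver re-adds the relocs for this operation and retries.
        }
        ... OUT_CS(cs, packet dwords referencing 'buf') ...
        ws->cs_write_reloc(cs, buf);   // reloc packet for 'buf'
        ws->cs_flush(cs, 0);           // submit the IB to the kernel
*/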

#include "radeon_drm_cs.h"

#include "util/u_memory.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>

/*
 * These defines are copied from radeon_drm.h; once an updated libdrm is
 * released, we should bump the configure.ac requirement for it and remove
 * the following definitions.
 */
#ifndef RADEON_CHUNK_ID_FLAGS
#define RADEON_CHUNK_ID_FLAGS       0x03

/* The first dword of RADEON_CHUNK_ID_FLAGS is a uint32 of these flags: */
#define RADEON_CS_KEEP_TILING_FLAGS 0x01

#endif

#ifndef RADEON_CS_USE_VM
#define RADEON_CS_USE_VM            0x02
/* The second dword of RADEON_CHUNK_ID_FLAGS is a uint32 that sets the ring type */
#define RADEON_CS_RING_GFX          0
#define RADEON_CS_RING_COMPUTE      1
#endif
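
/* As set up in radeon_init_cs_context and radeon_drm_cs_flush below, the
 * FLAGS chunk carries two dwords: flags[0] is a bitmask of
 * RADEON_CS_KEEP_TILING_FLAGS and RADEON_CS_USE_VM, and flags[1] selects the
 * ring (RADEON_CS_RING_GFX or RADEON_CS_RING_COMPUTE). */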

#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
                                      struct radeon_drm_winsys *ws)
{
    csc->fd = ws->fd;
    csc->nrelocs = 512;
    csc->relocs_bo = (struct radeon_bo**)
                     CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
    if (!csc->relocs_bo) {
        return FALSE;
    }

    csc->relocs = (struct drm_radeon_cs_reloc*)
                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
    if (!csc->relocs) {
        FREE(csc->relocs_bo);
        return FALSE;
    }

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
    return TRUE;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->crelocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i], NULL);
    }

    csc->crelocs = 0;
    csc->validated_crelocs = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;
    csc->used_gart = 0;
    csc->used_vram = 0;
    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}

DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param);

static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
{
    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    pipe_semaphore_init(&cs->flush_queued, 0);
    pipe_semaphore_init(&cs->flush_completed, 0);

    cs->ws = ws;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.buf = cs->csc->buf;

    p_atomic_inc(&ws->num_cs);
    if (cs->ws->num_cpus > 1 && debug_get_option_thread())
        cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs);
    return &cs->base;
}

#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)

static INLINE void update_reloc_domains(struct drm_radeon_cs_reloc *reloc,
                                        enum radeon_bo_domain rd,
                                        enum radeon_bo_domain wd,
                                        enum radeon_bo_domain *added_domains)
{
    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);

    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
}

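/* Return the index of an existing relocation for 'bo' in 'csc', or -1 if the
 * buffer has not been added. The lookup hashes the buffer handle into
 * is_handle_added and falls back to a linear search of the reloc list on a
 * hash collision. */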
int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);

    if (csc->is_handle_added[hash]) {
        reloc = csc->relocs_hashlist[hash];
        if (reloc->handle == bo->handle) {
            return csc->reloc_indices_hashlist[hash];
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                /* Put this reloc in the hash list.
                 * This will prevent additional hash collisions if there are
                 * several consecutive get_reloc calls for the same buffer.
                 *
                 * Example: Assuming buffers A,B,C collide in the hash list,
                 * the following sequence of relocs:
                 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
                 * will collide here: ^ and here:   ^,
                 * meaning that we should get very few collisions in the end. */
                csc->relocs_hashlist[hash] = reloc;
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    return -1;
}

static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
                                 struct radeon_bo *bo,
                                 enum radeon_bo_usage usage,
                                 enum radeon_bo_domain domains,
                                 enum radeon_bo_domain *added_domains)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;

    if (csc->is_handle_added[hash]) {
        reloc = csc->relocs_hashlist[hash];
        if (reloc->handle == bo->handle) {
            update_reloc_domains(reloc, rd, wd, added_domains);
            return csc->reloc_indices_hashlist[hash];
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                update_reloc_domains(reloc, rd, wd, added_domains);

                csc->relocs_hashlist[hash] = reloc;
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = 0;

    csc->is_handle_added[hash] = TRUE;
    csc->relocs_hashlist[hash] = reloc;
    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}

static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
                                        struct radeon_winsys_cs_handle *buf,
                                        enum radeon_bo_usage usage,
                                        enum radeon_bo_domain domains)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    unsigned index = radeon_add_reloc(cs->csc, bo, usage, domains, &added_domains);

    if (added_domains & RADEON_DOMAIN_GTT)
        cs->csc->used_gart += bo->base.size;
    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->csc->used_vram += bo->base.size;

    return index;
}

static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove the recently-added relocations. The validation failed with
         * them and the CS is about to be flushed because of that. Keep only
         * the already-validated relocations. */
        unsigned i;

        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
        } else {
            radeon_cs_context_cleanup(cs->csc);

            assert(cs->base.cdw == 0);
            if (cs->base.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
                                      struct radeon_winsys_cs_handle *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;

    int index = radeon_get_reloc(cs->csc, bo);

    if (index == -1) {
        fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
        return;
    }

    /* Emit a PKT3 NOP whose payload is the dword offset of the reloc in the
     * relocation chunk. */
    OUT_CS(&cs->base, 0xc0001000);
    OUT_CS(&cs->base, index * RELOC_DWORDS);
}

static void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc)
{
    unsigned i;

    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs))) {
        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information.\n");
        }
    }

    for (i = 0; i < csc->crelocs; i++)
        p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

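/* Submission thread. The CS is double-buffered: while the pipe driver fills
 * cs->csc, the previously flushed context in cs->cst is submitted here.
 * radeon_drm_cs_flush flips the two contexts and signals flush_queued;
 * radeon_drm_cs_sync_flush waits on flush_completed before the flushed
 * context may be reused. */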
static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
{
    struct radeon_drm_cs *cs = (struct radeon_drm_cs*)param;

    while (1) {
        pipe_semaphore_wait(&cs->flush_queued);
        if (cs->kill_thread)
            break;
        radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        pipe_semaphore_signal(&cs->flush_completed);
    }
    pipe_semaphore_signal(&cs->flush_completed);
    return NULL;
}

void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
{
    /* Wait for any pending ioctl to complete. */
    if (cs->thread && cs->flush_started) {
        pipe_semaphore_wait(&cs->flush_completed);
        cs->flush_started = 0;
    }
}

static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    radeon_drm_cs_sync_flush(cs);

    /* Flip command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is neither empty nor overflowed, submit it (in a separate
     * thread when an asynchronous flush was requested). */
    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS) {
        unsigned i, crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
        }

        cs->cst->flags[0] = 0;
        cs->cst->flags[1] = RADEON_CS_RING_GFX;
        cs->cst->cs.num_chunks = 2;
        if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
            cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->cs.num_chunks = 3;
        }
        if (cs->ws->info.r600_virtual_address) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
        }
        if (flags & RADEON_FLUSH_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
        }

        if (cs->thread &&
            (flags & RADEON_FLUSH_ASYNC)) {
            cs->flush_started = 1;
            pipe_semaphore_signal(&cs->flush_queued);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    radeon_drm_cs_sync_flush(cs);
    if (cs->thread) {
        cs->kill_thread = 1;
        pipe_semaphore_signal(&cs->flush_queued);
        pipe_semaphore_wait(&cs->flush_completed);
        pipe_thread_wait(cs->thread);
    }
    pipe_semaphore_destroy(&cs->flush_queued);
    pipe_semaphore_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    FREE(cs);
}

static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
                                    void (*flush)(void *ctx, unsigned flags),
                                    void *user)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    cs->flush_cs = flush;
    cs->flush_data = user;
}

static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                       struct radeon_winsys_cs_handle *_buf,
                                       enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return FALSE;

    index = radeon_get_reloc(cs->csc, bo);
    if (index == -1)
        return FALSE;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return TRUE;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return TRUE;

    return FALSE;
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_set_flush_callback = radeon_drm_cs_set_flush;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
}