radeon_drm_cs.c revision 92af184690995d3b16731518f7becfaac3538edb
1/*
2 * Copyright © 2008 Jérôme Glisse
3 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27/*
28 * Authors:
29 *      Marek Olšák <maraeo@gmail.com>
30 *
31 * Based on work from libdrm_radeon by:
32 *      Aapo Tahkola <aet@rasterburn.org>
33 *      Nicolai Haehnle <prefect_@gmx.net>
34 *      Jérôme Glisse <glisse@freedesktop.org>
35 */
36
37/*
38    This file replaces libdrm's radeon_cs_gem with our own implementation.
39    It's optimized specifically for Radeon DRM.
40    Reloc writes and space checking are faster and simpler than their
41    counterparts in libdrm (the time complexity of all the functions
42    is O(1) in nearly all scenarios, thanks to hashing).
43
44    It works like this:
45
46    cs_add_reloc(cs, buf, read_domain, write_domain) adds a new relocation and
47    also adds the size of 'buf' to the used_gart and used_vram winsys variables
48    based on the domains, which are simply or'd for the accounting purposes.
49    The adding is skipped if the reloc is already present in the list, but it
50    accounts any newly-referenced domains.
51
52    cs_validate is then called, which just checks:
53        used_vram/gart < vram/gart_size * 0.8
54    The 0.8 number allows for some memory fragmentation. If the validation
55    fails, the pipe driver flushes CS and tries to do the validation again,
56    i.e. it validates only that one operation. If it fails again, it drops
57    the operation on the floor and prints some nasty message to stderr.
58    (done in the pipe driver)
59
60    cs_write_reloc(cs, buf) just writes a reloc that has been added using
61    cs_add_reloc. The read_domain and write_domain parameters have been removed,
62    because we already specify them in cs_add_reloc.
63*/
64
65#include "radeon_drm_cs.h"
66
67#include "util/u_memory.h"
68
69#include <stdio.h>
70#include <stdlib.h>
71#include <stdint.h>
72#include <xf86drm.h>
73
74/*
75 * These definitions are copied from radeon_drm.h. Once an updated libdrm
76 * is released, we should bump the configure.ac requirement for it and
77 * remove the following definitions.
78 */
79#ifndef RADEON_CHUNK_ID_FLAGS
80#define RADEON_CHUNK_ID_FLAGS       0x03
81
82/* The first dword of RADEON_CHUNK_ID_FLAGS is a uint32 of these flags: */
83#define RADEON_CS_KEEP_TILING_FLAGS 0x01
84
85
86#endif
87
88#ifndef RADEON_CS_USE_VM
89#define RADEON_CS_USE_VM            0x02
90/* The second dword of RADEON_CHUNK_ID_FLAGS is a uint32 that sets the ring type */
91#define RADEON_CS_RING_GFX          0
92#define RADEON_CS_RING_COMPUTE      1
93#endif
94
95
96#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
97
98static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
99                                      struct radeon_drm_winsys *ws)
100{
101    csc->fd = ws->fd;
102    csc->nrelocs = 512;
103    csc->relocs_bo = (struct radeon_bo**)
104                     CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
105    if (!csc->relocs_bo) {
106        return FALSE;
107    }
108
109    csc->relocs = (struct drm_radeon_cs_reloc*)
110                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
111    if (!csc->relocs) {
112        FREE(csc->relocs_bo);
113        return FALSE;
114    }
115
116    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
117    csc->chunks[0].length_dw = 0;
118    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
119    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
120    csc->chunks[1].length_dw = 0;
121    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
122    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
123    csc->chunks[2].length_dw = 2;
124    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
125
126    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
127    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
128    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
129
130    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
131    return TRUE;
132}
133
134static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
135{
136    unsigned i;
137
138    for (i = 0; i < csc->crelocs; i++) {
139        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
140        radeon_bo_reference(&csc->relocs_bo[i], NULL);
141    }
142
143    csc->crelocs = 0;
144    csc->validated_crelocs = 0;
145    csc->chunks[0].length_dw = 0;
146    csc->chunks[1].length_dw = 0;
147    csc->used_gart = 0;
148    csc->used_vram = 0;
149    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
150}
151
/* Fully tear down a CS context: drop all buffer references via cleanup,
 * then free the relocation arrays allocated in radeon_init_cs_context. */
static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}
158
159DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
160static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param);
161
162static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
163{
164    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
165    struct radeon_drm_cs *cs;
166
167    cs = CALLOC_STRUCT(radeon_drm_cs);
168    if (!cs) {
169        return NULL;
170    }
171    pipe_semaphore_init(&cs->flush_queued, 0);
172    pipe_semaphore_init(&cs->flush_completed, 0);
173
174    cs->ws = ws;
175
176    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
177        FREE(cs);
178        return NULL;
179    }
180    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
181        radeon_destroy_cs_context(&cs->csc1);
182        FREE(cs);
183        return NULL;
184    }
185
186    /* Set the first command buffer as current. */
187    cs->csc = &cs->csc1;
188    cs->cst = &cs->csc2;
189    cs->base.buf = cs->csc->buf;
190
191    p_atomic_inc(&ws->num_cs);
192    if (cs->ws->num_cpus > 1 && debug_get_option_thread())
193        cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs);
194    return &cs->base;
195}
196
/* Append one dword to the command stream. No bounds checking is done here;
 * the caller is responsible for staying within the IB buffer. */
#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
198
199static INLINE void update_reloc_domains(struct drm_radeon_cs_reloc *reloc,
200                                        enum radeon_bo_domain rd,
201                                        enum radeon_bo_domain wd,
202                                        enum radeon_bo_domain *added_domains)
203{
204    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
205
206    reloc->read_domains |= rd;
207    reloc->write_domain |= wd;
208}
209
210int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
211{
212    struct drm_radeon_cs_reloc *reloc;
213    unsigned i;
214    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
215
216    if (csc->is_handle_added[hash]) {
217        reloc = csc->relocs_hashlist[hash];
218        if (reloc->handle == bo->handle) {
219            return csc->reloc_indices_hashlist[hash];
220        }
221
222        /* Hash collision, look for the BO in the list of relocs linearly. */
223        for (i = csc->crelocs; i != 0;) {
224            --i;
225            reloc = &csc->relocs[i];
226            if (reloc->handle == bo->handle) {
227                /* Put this reloc in the hash list.
228                 * This will prevent additional hash collisions if there are
229                 * several consecutive get_reloc calls for the same buffer.
230                 *
231                 * Example: Assuming buffers A,B,C collide in the hash list,
232                 * the following sequence of relocs:
233                 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
234                 * will collide here: ^ and here:   ^,
235                 * meaning that we should get very few collisions in the end. */
236                csc->relocs_hashlist[hash] = reloc;
237                csc->reloc_indices_hashlist[hash] = i;
238                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
239                return i;
240            }
241        }
242    }
243
244    return -1;
245}
246
247static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
248                                 struct radeon_bo *bo,
249                                 enum radeon_bo_usage usage,
250                                 enum radeon_bo_domain domains,
251                                 enum radeon_bo_domain *added_domains)
252{
253    struct drm_radeon_cs_reloc *reloc;
254    unsigned i;
255    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
256    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
257    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
258
259    if (csc->is_handle_added[hash]) {
260        reloc = csc->relocs_hashlist[hash];
261        if (reloc->handle == bo->handle) {
262            update_reloc_domains(reloc, rd, wd, added_domains);
263            return csc->reloc_indices_hashlist[hash];
264        }
265
266        /* Hash collision, look for the BO in the list of relocs linearly. */
267        for (i = csc->crelocs; i != 0;) {
268            --i;
269            reloc = &csc->relocs[i];
270            if (reloc->handle == bo->handle) {
271                update_reloc_domains(reloc, rd, wd, added_domains);
272
273                csc->relocs_hashlist[hash] = reloc;
274                csc->reloc_indices_hashlist[hash] = i;
275                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
276                return i;
277            }
278        }
279    }
280
281    /* New relocation, check if the backing array is large enough. */
282    if (csc->crelocs >= csc->nrelocs) {
283        uint32_t size;
284        csc->nrelocs += 10;
285
286        size = csc->nrelocs * sizeof(struct radeon_bo*);
287        csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size);
288
289        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
290        csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size);
291
292        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
293    }
294
295    /* Initialize the new relocation. */
296    csc->relocs_bo[csc->crelocs] = NULL;
297    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
298    p_atomic_inc(&bo->num_cs_references);
299    reloc = &csc->relocs[csc->crelocs];
300    reloc->handle = bo->handle;
301    reloc->read_domains = rd;
302    reloc->write_domain = wd;
303    reloc->flags = 0;
304
305    csc->is_handle_added[hash] = TRUE;
306    csc->relocs_hashlist[hash] = reloc;
307    csc->reloc_indices_hashlist[hash] = csc->crelocs;
308
309    csc->chunks[1].length_dw += RELOC_DWORDS;
310
311    *added_domains = rd | wd;
312    return csc->crelocs++;
313}
314
315static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
316                                        struct radeon_winsys_cs_handle *buf,
317                                        enum radeon_bo_usage usage,
318                                        enum radeon_bo_domain domains)
319{
320    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
321    struct radeon_bo *bo = (struct radeon_bo*)buf;
322    enum radeon_bo_domain added_domains;
323
324    unsigned index = radeon_add_reloc(cs->csc, bo, usage, domains, &added_domains);
325
326    if (added_domains & RADEON_DOMAIN_GTT)
327        cs->csc->used_gart += bo->base.size;
328    if (added_domains & RADEON_DOMAIN_VRAM)
329        cs->csc->used_vram += bo->base.size;
330
331    return index;
332}
333
334static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
335{
336    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
337    boolean status =
338        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
339        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;
340
341    if (status) {
342        cs->csc->validated_crelocs = cs->csc->crelocs;
343    } else {
344        /* Remove lately-added relocations. The validation failed with them
345         * and the CS is about to be flushed because of that. Keep only
346         * the already-validated relocations. */
347        unsigned i;
348
349        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
350            p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
351            radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
352        }
353        cs->csc->crelocs = cs->csc->validated_crelocs;
354
355        /* Flush if there are any relocs. Clean up otherwise. */
356        if (cs->csc->crelocs) {
357            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
358        } else {
359            radeon_cs_context_cleanup(cs->csc);
360
361            assert(cs->base.cdw == 0);
362            if (cs->base.cdw != 0) {
363                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
364            }
365        }
366    }
367    return status;
368}
369
370static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
371                                      struct radeon_winsys_cs_handle *buf)
372{
373    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
374    struct radeon_bo *bo = (struct radeon_bo*)buf;
375
376    unsigned index = radeon_get_reloc(cs->csc, bo);
377
378    if (index == -1) {
379        fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
380        return;
381    }
382
383    OUT_CS(&cs->base, 0xc0001000);
384    OUT_CS(&cs->base, index * RELOC_DWORDS);
385}
386
387static void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc)
388{
389    unsigned i;
390
391    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
392                            &csc->cs, sizeof(struct drm_radeon_cs))) {
393        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
394            unsigned i;
395
396            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
397            for (i = 0; i < csc->chunks[0].length_dw; i++) {
398                fprintf(stderr, "0x%08X\n", csc->buf[i]);
399            }
400        } else {
401            fprintf(stderr, "radeon: The kernel rejected CS, "
402                    "see dmesg for more information.\n");
403        }
404    }
405
406    for (i = 0; i < csc->crelocs; i++)
407        p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);
408
409    radeon_cs_context_cleanup(csc);
410}
411
412static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
413{
414    struct radeon_drm_cs *cs = (struct radeon_drm_cs*)param;
415
416    while (1) {
417        pipe_semaphore_wait(&cs->flush_queued);
418        if (cs->kill_thread)
419            break;
420        radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
421        pipe_semaphore_signal(&cs->flush_completed);
422    }
423    pipe_semaphore_signal(&cs->flush_completed);
424    return NULL;
425}
426
427void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
428{
429    /* Wait for any pending ioctl to complete. */
430    if (cs->thread && cs->flush_started) {
431        pipe_semaphore_wait(&cs->flush_completed);
432        cs->flush_started = 0;
433    }
434}
435
436static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
437{
438    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
439    struct radeon_cs_context *tmp;
440
441    radeon_drm_cs_sync_flush(cs);
442
443    /* Flip command streams. */
444    tmp = cs->csc;
445    cs->csc = cs->cst;
446    cs->cst = tmp;
447
448    /* If the CS is not empty, emit it in a separate thread. */
449    if (cs->base.cdw) {
450        unsigned i, crelocs = cs->cst->crelocs;
451
452        cs->cst->chunks[0].length_dw = cs->base.cdw;
453
454        for (i = 0; i < crelocs; i++) {
455            /* Update the number of active asynchronous CS ioctls for the buffer. */
456            p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
457        }
458
459        cs->cst->flags[0] = 0;
460        cs->cst->flags[1] = RADEON_CS_RING_GFX;
461        cs->cst->cs.num_chunks = 2;
462        if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
463            cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
464            cs->cst->cs.num_chunks = 3;
465        }
466        if (cs->ws->info.r600_virtual_address) {
467            cs->cst->flags[0] |= RADEON_CS_USE_VM;
468            cs->cst->cs.num_chunks = 3;
469        }
470        if (flags & RADEON_FLUSH_COMPUTE) {
471            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
472            cs->cst->cs.num_chunks = 3;
473        }
474
475        if (cs->thread &&
476            (flags & RADEON_FLUSH_ASYNC)) {
477            cs->flush_started = 1;
478            pipe_semaphore_signal(&cs->flush_queued);
479        } else {
480            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
481        }
482    } else {
483        radeon_cs_context_cleanup(cs->cst);
484    }
485
486    /* Prepare a new CS. */
487    cs->base.buf = cs->csc->buf;
488    cs->base.cdw = 0;
489}
490
491static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
492{
493    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
494    radeon_drm_cs_sync_flush(cs);
495    if (cs->thread) {
496        cs->kill_thread = 1;
497        pipe_semaphore_signal(&cs->flush_queued);
498        pipe_semaphore_wait(&cs->flush_completed);
499        pipe_thread_wait(cs->thread);
500    }
501    pipe_semaphore_destroy(&cs->flush_queued);
502    pipe_semaphore_destroy(&cs->flush_completed);
503    radeon_cs_context_cleanup(&cs->csc1);
504    radeon_cs_context_cleanup(&cs->csc2);
505    p_atomic_dec(&cs->ws->num_cs);
506    radeon_destroy_cs_context(&cs->csc1);
507    radeon_destroy_cs_context(&cs->csc2);
508    FREE(cs);
509}
510
511static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
512                                    void (*flush)(void *ctx, unsigned flags),
513                                    void *user)
514{
515    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
516    cs->flush_cs = flush;
517    cs->flush_data = user;
518}
519
520static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
521                                       struct radeon_winsys_cs_handle *_buf,
522                                       enum radeon_bo_usage usage)
523{
524    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
525    struct radeon_bo *bo = (struct radeon_bo*)_buf;
526    int index;
527
528    if (!bo->num_cs_references)
529        return FALSE;
530
531    index = radeon_get_reloc(cs->csc, bo);
532    if (index == -1)
533        return FALSE;
534
535    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
536        return TRUE;
537    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
538        return TRUE;
539
540    return FALSE;
541}
542
543void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
544{
545    ws->base.cs_create = radeon_drm_cs_create;
546    ws->base.cs_destroy = radeon_drm_cs_destroy;
547    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
548    ws->base.cs_validate = radeon_drm_cs_validate;
549    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
550    ws->base.cs_flush = radeon_drm_cs_flush;
551    ws->base.cs_set_flush_callback = radeon_drm_cs_set_flush;
552    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
553}
554