radeon_drm_cs.c revision e6fb62594fca714883af9bba9795be8838c16900
/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 *
 * Based on work from libdrm_radeon by:
 *      Aapo Tahkola <aet@rasterburn.org>
 *      Nicolai Haehnle <prefect_@gmx.net>
 *      Jérôme Glisse <glisse@freedesktop.org>
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Reloc writes and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_reloc(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The addition is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints an error message to stderr.
    (This is done in the pipe driver.)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_reloc. The read_domain and write_domain parameters have been removed,
    because they are already specified in cs_add_reloc.
*/
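/*
    For illustration only, a rough sketch of the call sequence described
    above, roughly as a pipe driver would issue it. The names follow this
    file's winsys interface; the driver-side wrappers differ.

        cs_add_reloc(cs, buf, RADEON_DOMAIN_GTT, 0);
        if (!cs_validate(cs)) {
            // flush, re-validate, and drop the operation if it fails again
        }
        ...emit command packets referencing 'buf'...
        cs_write_reloc(cs, buf);
        cs_flush(cs, RADEON_FLUSH_ASYNC);
*/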

#include "radeon_drm_cs.h"

#include "util/u_memory.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>

#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

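/* Initialize a CS context: allocate the relocation arrays and describe the
 * two chunks (IB commands and relocations) that are handed to the kernel. */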
static boolean radeon_init_cs_context(struct radeon_cs_context *csc, int fd)
{
    csc->fd = fd;
    csc->nrelocs = 512;
    csc->relocs_bo = (struct radeon_bo**)
                     CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
    if (!csc->relocs_bo) {
        return FALSE;
    }

    csc->relocs = (struct drm_radeon_cs_reloc*)
                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
    if (!csc->relocs) {
        FREE(csc->relocs_bo);
        return FALSE;
    }

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];

    csc->cs.num_chunks = 2;
    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
    return TRUE;
}

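/* Release all relocations and reset the per-submission state so the context
 * can be reused for the next command stream. */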
static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->crelocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i], NULL);
    }

    csc->crelocs = 0;
    csc->validated_crelocs = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;
    csc->used_gart = 0;
    csc->used_vram = 0;
    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}

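/* Create a command stream. Two CS contexts are allocated so that one can be
 * filled by the driver while the other is being submitted by the flush
 * thread. */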
static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
{
    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }

    cs->ws = ws;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws->fd)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws->fd)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.buf = cs->csc->buf;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)

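/* Merge new read/write domains into an existing relocation and return, in
 * *added_domains, the domains that were not referenced before (used for
 * memory accounting). When the new domains conflict with the old ones
 * (a previously-read domain is now written, or vice versa), the read
 * domains are replaced rather than accumulated. */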
static INLINE void update_domains(struct drm_radeon_cs_reloc *reloc,
                                  enum radeon_bo_domain rd,
                                  enum radeon_bo_domain wd,
                                  enum radeon_bo_domain *added_domains)
{
    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);

    if (reloc->read_domains & wd) {
        reloc->read_domains = rd;
        reloc->write_domain = wd;
    } else if (rd & reloc->write_domain) {
        reloc->read_domains = rd;
        reloc->write_domain |= wd;
    } else {
        reloc->read_domains |= rd;
        reloc->write_domain |= wd;
    }
}

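/* Return the index of the relocation for the given buffer, or -1 if the
 * buffer hasn't been added to this CS. The lookup uses a small hash table
 * first and falls back to a linear search of the reloc list on a collision. */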
int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);

    if (csc->is_handle_added[hash]) {
        reloc = csc->relocs_hashlist[hash];
        if (reloc->handle == bo->handle) {
            return csc->reloc_indices_hashlist[hash];
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                /* Put this reloc in the hash list.
                 * This will prevent additional hash collisions if there are
                 * several subsequent get_reloc calls for the same buffer.
                 *
                 * Example: Assuming buffers A,B,C collide in the hash list,
                 * the following sequence of relocs:
                 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
                 * will collide here: ^ and here:   ^,
                 * meaning that we should get very few collisions in the end. */
                csc->relocs_hashlist[hash] = reloc;
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    return -1;
}

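/* Add a relocation for the given buffer, or update the domains of an
 * existing one, and return its index. The reloc arrays are grown on demand. */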
static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
                                 struct radeon_bo *bo,
                                 enum radeon_bo_domain rd,
                                 enum radeon_bo_domain wd,
                                 enum radeon_bo_domain *added_domains)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);

    if (csc->is_handle_added[hash]) {
        reloc = csc->relocs_hashlist[hash];
        if (reloc->handle == bo->handle) {
            update_domains(reloc, rd, wd, added_domains);
            return csc->reloc_indices_hashlist[hash];
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                update_domains(reloc, rd, wd, added_domains);

                csc->relocs_hashlist[hash] = reloc;
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = 0;

    csc->is_handle_added[hash] = TRUE;
    csc->relocs_hashlist[hash] = reloc;
    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}

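/* winsys entrypoint: add a relocation and charge the buffer size to the
 * used_gart/used_vram counters for each newly-referenced domain. */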
static void radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
                                    struct radeon_winsys_cs_handle *buf,
                                    enum radeon_bo_domain rd,
                                    enum radeon_bo_domain wd)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    radeon_add_reloc(cs->csc, bo, rd, wd, &added_domains);

    if (!added_domains)
        return;

    if (added_domains & RADEON_DOMAIN_GTT)
        cs->csc->used_gart += bo->size;
    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->csc->used_vram += bo->size;
}

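/* winsys entrypoint: check that the referenced buffers fit into 80% of GART
 * and VRAM. On failure, the relocations added since the last successful
 * validation are dropped and, if any validated relocs remain, the CS is
 * flushed asynchronously (see the comment at the top of this file). */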
static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove the recently-added relocations. The validation failed with
         * them and the CS is about to be flushed because of that. Keep only
         * the already-validated relocations. */
        unsigned i;

        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
        } else {
            radeon_cs_context_cleanup(cs->csc);

            assert(cs->base.cdw == 0);
            if (cs->base.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

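/* winsys entrypoint: emit a relocation for a buffer previously added with
 * cs_add_reloc. The relocation is encoded as a NOP packet (0xc0001000)
 * followed by the dword offset of the reloc in the relocation chunk, which
 * is the form the kernel CS checker parses. */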
static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
                                      struct radeon_winsys_cs_handle *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;

    int index = radeon_get_reloc(cs->csc, bo);

    if (index == -1) {
        fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
        return;
    }

    OUT_CS(&cs->base, 0xc0001000);
    OUT_CS(&cs->base, index * RELOC_DWORDS);
}

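/* Submit the CS to the kernel through the DRM_RADEON_CS ioctl. Runs either
 * directly or as a thread routine. On rejection, the IB is dumped when the
 * RADEON_DUMP_CS environment variable is set. */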
static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
{
    struct radeon_cs_context *csc = (struct radeon_cs_context*)param;
    unsigned i;

    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs))) {
        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information.\n");
        }
    }

    for (i = 0; i < csc->crelocs; i++)
        p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
    return NULL;
}

void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
{
    /* Wait for any pending ioctl to complete. */
    if (cs->thread) {
        pipe_thread_wait(cs->thread);
        cs->thread = 0;
    }
}

DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)

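/* winsys entrypoint: submit the current CS. The ioctl is issued from a
 * separate thread when more than one CPU is available, RADEON_THREAD is not
 * disabled and RADEON_FLUSH_ASYNC was requested; otherwise it is issued
 * synchronously. The two CS contexts are then flipped so that recording can
 * continue while the previous CS is being submitted. */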
static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    radeon_drm_cs_sync_flush(cs);

    /* If the CS is not empty, emit it, possibly in a newly-spawned thread. */
    if (cs->base.cdw) {
        unsigned i, crelocs = cs->csc->crelocs;

        cs->csc->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->csc->relocs_bo[i]->num_active_ioctls);

            /* Update whether the buffer is busy for write. */
            if (cs->csc->relocs[i].write_domain) {
                cs->csc->relocs_bo[i]->busy_for_write = TRUE;
            }
        }

        if (cs->ws->num_cpus > 1 && debug_get_option_thread() &&
            (flags & RADEON_FLUSH_ASYNC)) {
            cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs->csc);
            assert(cs->thread);
        } else {
            radeon_drm_cs_emit_ioctl(cs->csc);
        }
    } else {
        radeon_cs_context_cleanup(cs->csc);
    }

    /* Flip command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}

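/* winsys entrypoint: wait for any pending submission and free both CS contexts. */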
static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    radeon_drm_cs_sync_flush(cs);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    FREE(cs);
}

static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
                                    void (*flush)(void *ctx, unsigned flags),
                                    void *user)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    cs->flush_cs = flush;
    cs->flush_data = user;
}

static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                       struct radeon_winsys_cs_handle *_buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;

    return radeon_bo_is_referenced_by_cs(cs, bo);
}

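/* Like radeon_drm_cs_add_reloc, but also returns the relocation index to the
 * caller; hooked up as trans_add_reloc in the winsys vtable below. */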
static unsigned trans_add_reloc(struct radeon_winsys_cs *rcs,
                                struct radeon_winsys_cs_handle *buf,
                                enum radeon_bo_domain rd,
                                enum radeon_bo_domain wd)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    unsigned index = radeon_add_reloc(cs->csc, bo, rd, wd, &added_domains);

    if (added_domains & RADEON_DOMAIN_GTT)
        cs->csc->used_gart += bo->size;
    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->csc->used_vram += bo->size;

    return index;
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_set_flush = radeon_drm_cs_set_flush;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;

    ws->base.trans_add_reloc = trans_add_reloc;
}