/* radeon_drm_cs.c, revision 64ab39b035f755510a644643b96451431bbe5f27 */
/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 *
 * Based on work from libdrm_radeon by:
 *      Aapo Tahkola <aet@rasterburn.org>
 *      Nicolai Haehnle <prefect_@gmx.net>
 *      Jérôme Glisse <glisse@freedesktop.org>
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Reloc writes and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_reloc(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply or'd for accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_reloc. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_reloc.
*/
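/*
    A minimal usage sketch of the flow described above, as a pipe driver might
    use it through the winsys vtable set up in radeon_drm_cs_init_functions().
    It is illustrative only and not part of this file; the names 'ws', 'cs',
    'buf' and the emitted packet dword are assumptions.

        // Declare the buffer and account for its size (a GTT read here).
        ws->cs_add_reloc(cs, buf, RADEON_DOMAIN_GTT, 0);

        // Check the memory accounting. On failure, cs_validate drops the
        // relocs added since the last successful validation and, if anything
        // was already validated, calls the flush callback registered with
        // cs_set_flush; the driver then re-adds the reloc and validates again.
        if (!ws->cs_validate(cs)) {
            ws->cs_add_reloc(cs, buf, RADEON_DOMAIN_GTT, 0);
            if (!ws->cs_validate(cs))
                fprintf(stderr, "radeon: the operation does not fit in memory\n");
        }

        // Emit the packet that references the buffer, then its relocation.
        cs->buf[cs->cdw++] = ...;        // packet dword(s) referencing 'buf'
        ws->cs_write_reloc(cs, buf);
*/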

#include "radeon_drm_cs.h"

#include "util/u_memory.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>

#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static boolean radeon_init_cs_context(struct radeon_cs_context *csc, int fd)
{
    csc->fd = fd;
    csc->nrelocs = 512;
    csc->relocs_bo = (struct radeon_bo**)
                     CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
    if (!csc->relocs_bo) {
        return FALSE;
    }

    csc->relocs = (struct drm_radeon_cs_reloc*)
                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
    if (!csc->relocs) {
        FREE(csc->relocs_bo);
        return FALSE;
    }

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];

    csc->cs.num_chunks = 2;
    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
    return TRUE;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->crelocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i], NULL);
    }

    csc->crelocs = 0;
    csc->validated_crelocs = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;
    csc->used_gart = 0;
    csc->used_vram = 0;
    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}

static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
{
    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }

    cs->ws = ws;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws->fd)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws->fd)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

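    /* The two contexts are used in alternation: 'csc' is the stream commands
     * are currently recorded into, 'cst' is the stream that was last handed
     * to the kernel (possibly still being submitted by the worker thread).
     * radeon_drm_cs_flush() flips the two. */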
    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.buf = cs->csc->buf;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

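/* Append one dword to the command stream; the caller is responsible for not
 * overflowing the buffer. */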
#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)

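/* Merge the requested read/write domains into an existing relocation and
 * report in *added_domains which domains were not referenced before, so the
 * caller can update the used_gart/used_vram accounting. */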
static INLINE void update_domains(struct drm_radeon_cs_reloc *reloc,
                                  enum radeon_bo_domain rd,
                                  enum radeon_bo_domain wd,
                                  enum radeon_bo_domain *added_domains)
{
    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);

    if (reloc->read_domains & wd) {
        reloc->read_domains = rd;
        reloc->write_domain = wd;
    } else if (rd & reloc->write_domain) {
        reloc->read_domains = rd;
        reloc->write_domain |= wd;
    } else {
        reloc->read_domains |= rd;
        reloc->write_domain |= wd;
    }
}

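/* Return the index of the relocation for the given buffer, or -1 if the
 * buffer has not been added yet. The lookup goes through a small
 * direct-mapped hash table indexed by the low bits of the buffer handle
 * (the masking trick assumes sizeof(csc->is_handle_added) is a power of
 * two); on a collision it falls back to a linear search over the relocs. */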
int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);

    if (csc->is_handle_added[hash]) {
        reloc = csc->relocs_hashlist[hash];
        if (reloc->handle == bo->handle) {
            return csc->reloc_indices_hashlist[hash];
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                /* Put this reloc in the hash list.
                 * This will prevent additional hash collisions if there are
                 * several subsequent get_reloc calls for the same buffer.
                 *
                 * Example: Assuming buffers A,B,C collide in the hash list,
                 * the following sequence of relocs:
                 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
                 * will collide here: ^ and here:   ^,
                 * meaning that we should get very few collisions in the end. */
                csc->relocs_hashlist[hash] = reloc;
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    return -1;
}

static void radeon_add_reloc(struct radeon_cs_context *csc,
                             struct radeon_bo *bo,
                             enum radeon_bo_domain rd,
                             enum radeon_bo_domain wd,
                             enum radeon_bo_domain *added_domains)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);

    if (csc->is_handle_added[hash]) {
        reloc = csc->relocs_hashlist[hash];
        if (reloc->handle == bo->handle) {
            update_domains(reloc, rd, wd, added_domains);
            return;
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                update_domains(reloc, rd, wd, added_domains);

                csc->relocs_hashlist[hash] = reloc;
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return;
            }
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = 0;

    csc->is_handle_added[hash] = TRUE;
    csc->relocs_hashlist[hash] = reloc;
    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;
    csc->crelocs++;

    *added_domains = rd | wd;
}

static void radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
                                    struct radeon_winsys_cs_handle *buf,
                                    enum radeon_bo_domain rd,
                                    enum radeon_bo_domain wd)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    radeon_add_reloc(cs->csc, bo, rd, wd, &added_domains);

    if (!added_domains)
        return;

    if (added_domains & RADEON_DOMAIN_GTT)
        cs->csc->used_gart += bo->size;
    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->csc->used_vram += bo->size;
}

static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove the relocations added since the last successful validation.
         * The validation failed with them and the CS is about to be flushed
         * because of that. Keep only the already-validated relocations. */
        unsigned i;

        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
        } else {
            radeon_cs_context_cleanup(cs->csc);

            assert(cs->base.cdw == 0);
            if (cs->base.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
                                      struct radeon_winsys_cs_handle *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;

    int index = radeon_get_reloc(cs->csc, bo);

    if (index == -1) {
        fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
        return;
    }

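    /* The relocation is emitted as a one-dword type-3 NOP packet whose
     * payload is the dword offset of the reloc entry in the RELOCS chunk;
     * the kernel CS parser uses it to look up the buffer and patch the
     * command stream. */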
    OUT_CS(&cs->base, 0xc0001000);
    OUT_CS(&cs->base, index * RELOC_DWORDS);
}

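/* Thread entry point (also called directly for synchronous flushes): submit
 * the CS ioctl for the given context and, when it returns, decrement the
 * per-buffer count of active ioctls. */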
static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
{
    struct radeon_cs_context *csc = (struct radeon_cs_context*)param;
    unsigned i;

    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs))) {
        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information.\n");
        }
    }

    for (i = 0; i < csc->crelocs; i++)
        p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);
    return NULL;
}

void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
{
    /* Wait for any pending ioctl to complete. */
    if (cs->thread) {
        pipe_thread_wait(cs->thread);
        cs->thread = 0;
    }
}

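/* Asynchronous (threaded) submission can be disabled by setting
 * RADEON_THREAD=0 in the environment. */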
DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)

static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    radeon_drm_cs_sync_flush(cs);

    /* If the CS is not empty, submit it (in a separate thread when async
     * flushing is allowed). */
    if (cs->base.cdw) {
        unsigned i, crelocs = cs->csc->crelocs;

        cs->csc->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->csc->relocs_bo[i]->num_active_ioctls);

            /* Update whether the buffer is busy for write. */
            if (cs->csc->relocs[i].write_domain) {
                cs->csc->relocs_bo[i]->busy_for_write = TRUE;
            }
        }

        if (cs->ws->num_cpus > 1 && debug_get_option_thread() &&
            (flags & RADEON_FLUSH_ASYNC)) {
            cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs->csc);
            assert(cs->thread);
        } else {
            radeon_drm_cs_emit_ioctl(cs->csc);
        }
    }

    /* Flip command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* Prepare a new CS. */
    radeon_cs_context_cleanup(cs->csc);

    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    radeon_drm_cs_sync_flush(cs);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    FREE(cs);
}

static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
                                    void (*flush)(void *ctx, unsigned flags),
                                    void *user)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    cs->flush_cs = flush;
    cs->flush_data = user;
}

static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                       struct radeon_winsys_cs_handle *_buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;

    return radeon_bo_is_referenced_by_cs(cs, bo);
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_set_flush = radeon_drm_cs_set_flush;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
}