/* radeon_drm_cs.c — revision f6a66a33f7ba52e36242db9346573bf477efa04e */
1/*
2 * Copyright © 2008 Jérôme Glisse
3 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27/*
28 * Authors:
29 *      Marek Olšák <maraeo@gmail.com>
30 *
31 * Based on work from libdrm_radeon by:
32 *      Aapo Tahkola <aet@rasterburn.org>
33 *      Nicolai Haehnle <prefect_@gmx.net>
34 *      Jérôme Glisse <glisse@freedesktop.org>
35 */
36
37/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
39    It's optimized specifically for Radeon DRM.
40    Reloc writes and space checking are faster and simpler than their
41    counterparts in libdrm (the time complexity of all the functions
42    is O(1) in nearly all scenarios, thanks to hashing).
43
44    It works like this:
45
46    cs_add_reloc(cs, buf, read_domain, write_domain) adds a new relocation and
47    also adds the size of 'buf' to the used_gart and used_vram winsys variables
48    based on the domains, which are simply or'd for the accounting purposes.
49    The adding is skipped if the reloc is already present in the list, but it
50    accounts any newly-referenced domains.
51
52    cs_validate is then called, which just checks:
53        used_vram/gart < vram/gart_size * 0.8
54    The 0.8 number allows for some memory fragmentation. If the validation
55    fails, the pipe driver flushes CS and tries do the validation again,
56    i.e. it validates only that one operation. If it fails again, it drops
57    the operation on the floor and prints some nasty message to stderr.
58    (done in the pipe driver)
59
60    cs_write_reloc(cs, buf) just writes a reloc that has been added using
61    cs_add_reloc. The read_domain and write_domain parameters have been removed,
62    because we already specify them in cs_add_reloc.
63*/
64
65#include "radeon_drm_cs.h"
66
67#include "util/u_memory.h"
68
69#include <stdio.h>
70#include <stdlib.h>
71#include <stdint.h>
72#include <xf86drm.h>
73
/*
 * These definitions are copied from radeon_drm.h; once an updated libdrm
 * is released, we should bump the configure.ac requirement for it and
 * remove the following fallbacks.
 */
79#ifndef RADEON_CHUNK_ID_FLAGS
80#define RADEON_CHUNK_ID_FLAGS       0x03
81
82/* The first dword of RADEON_CHUNK_ID_FLAGS is a uint32 of these flags: */
83#define RADEON_CS_KEEP_TILING_FLAGS 0x01
84
85
86#endif
87
88#ifndef RADEON_CS_USE_VM
89#define RADEON_CS_USE_VM            0x02
90/* The second dword of RADEON_CHUNK_ID_FLAGS is a uint32 that sets the ring type */
91#define RADEON_CS_RING_GFX          0
92#define RADEON_CS_RING_COMPUTE      1
93#endif
94
95
96#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
97
98static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
99                                      struct radeon_drm_winsys *ws)
100{
101    csc->fd = ws->fd;
102    csc->nrelocs = 512;
103    csc->relocs_bo = (struct radeon_bo**)
104                     CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
105    if (!csc->relocs_bo) {
106        return FALSE;
107    }
108
109    csc->relocs = (struct drm_radeon_cs_reloc*)
110                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
111    if (!csc->relocs) {
112        FREE(csc->relocs_bo);
113        return FALSE;
114    }
115
116    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
117    csc->chunks[0].length_dw = 0;
118    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
119    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
120    csc->chunks[1].length_dw = 0;
121    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
122    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
123    csc->chunks[2].length_dw = 2;
124    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
125
126    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
127    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
128    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
129
130    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
131    return TRUE;
132}
133
134static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
135{
136    unsigned i;
137
138    for (i = 0; i < csc->crelocs; i++) {
139        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
140        radeon_bo_reference(&csc->relocs_bo[i], NULL);
141    }
142
143    csc->crelocs = 0;
144    csc->validated_crelocs = 0;
145    csc->chunks[0].length_dw = 0;
146    csc->chunks[1].length_dw = 0;
147    csc->used_gart = 0;
148    csc->used_vram = 0;
149    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
150}
151
152static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
153{
154    radeon_cs_context_cleanup(csc);
155    FREE(csc->relocs_bo);
156    FREE(csc->relocs);
157}
158
/* RADEON_THREAD=0 in the environment disables the asynchronous flush thread. */
DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
/* Worker-thread entry point, defined below. */
static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param);
161
162static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
163{
164    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
165    struct radeon_drm_cs *cs;
166
167    cs = CALLOC_STRUCT(radeon_drm_cs);
168    if (!cs) {
169        return NULL;
170    }
171    pipe_semaphore_init(&cs->flush_queued, 0);
172    pipe_semaphore_init(&cs->flush_completed, 0);
173
174    cs->ws = ws;
175
176    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
177        FREE(cs);
178        return NULL;
179    }
180    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
181        radeon_destroy_cs_context(&cs->csc1);
182        FREE(cs);
183        return NULL;
184    }
185
186    /* Set the first command buffer as current. */
187    cs->csc = &cs->csc1;
188    cs->cst = &cs->csc2;
189    cs->base.buf = cs->csc->buf;
190
191    p_atomic_inc(&ws->num_cs);
192    if (cs->ws->num_cpus > 1 && debug_get_option_thread())
193        cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs);
194    return &cs->base;
195}
196
/* Append one dword to the command buffer (no bounds checking here;
 * overflow is detected at flush time). */
#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
198
199static INLINE void update_reloc_domains(struct drm_radeon_cs_reloc *reloc,
200                                        enum radeon_bo_domain rd,
201                                        enum radeon_bo_domain wd,
202                                        enum radeon_bo_domain *added_domains)
203{
204    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
205
206    reloc->read_domains |= rd;
207    reloc->write_domain |= wd;
208}
209
210int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
211{
212    struct drm_radeon_cs_reloc *reloc;
213    unsigned i;
214    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
215
216    if (csc->is_handle_added[hash]) {
217        i = csc->reloc_indices_hashlist[hash];
218        reloc = &csc->relocs[i];
219        if (reloc->handle == bo->handle) {
220            return i;
221        }
222
223        /* Hash collision, look for the BO in the list of relocs linearly. */
224        for (i = csc->crelocs; i != 0;) {
225            --i;
226            reloc = &csc->relocs[i];
227            if (reloc->handle == bo->handle) {
228                /* Put this reloc in the hash list.
229                 * This will prevent additional hash collisions if there are
230                 * several consecutive get_reloc calls for the same buffer.
231                 *
232                 * Example: Assuming buffers A,B,C collide in the hash list,
233                 * the following sequence of relocs:
234                 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
235                 * will collide here: ^ and here:   ^,
236                 * meaning that we should get very few collisions in the end. */
237                csc->reloc_indices_hashlist[hash] = i;
238                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
239                return i;
240            }
241        }
242    }
243
244    return -1;
245}
246
/* Add 'bo' to the reloc list, or merge the requested domains into the
 * existing entry if the buffer is already present.  Returns the reloc
 * index.  '*added_domains' receives only the newly-referenced domains,
 * which the caller uses for GART/VRAM accounting. */
static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
                                 struct radeon_bo *bo,
                                 enum radeon_bo_usage usage,
                                 enum radeon_bo_domain domains,
                                 enum radeon_bo_domain *added_domains)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    /* NOTE(review): this mask assumes sizeof(csc->is_handle_added) is a
     * power of two — confirm against the array size in radeon_drm_cs.h. */
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
    /* Split the requested domains by usage direction. */
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;

    if (csc->is_handle_added[hash]) {
        /* Fast path: the hash slot already points at this buffer. */
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle == bo->handle) {
            update_reloc_domains(reloc, rd, wd, added_domains);
            return i;
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                update_reloc_domains(reloc, rd, wd, added_domains);

                /* Cache this index to avoid repeated collisions on
                 * consecutive calls for the same buffer. */
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        /* NOTE(review): these realloc results are unchecked — on failure
         * the old pointer is overwritten with NULL and the writes below
         * crash.  Consider the tmp-pointer realloc pattern. */
        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size);

        /* The relocs array may have moved; refresh the kernel chunk pointer. */
        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = 0;

    csc->is_handle_added[hash] = TRUE;
    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}
313
314static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
315                                        struct radeon_winsys_cs_handle *buf,
316                                        enum radeon_bo_usage usage,
317                                        enum radeon_bo_domain domains)
318{
319    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
320    struct radeon_bo *bo = (struct radeon_bo*)buf;
321    enum radeon_bo_domain added_domains;
322
323    unsigned index = radeon_add_reloc(cs->csc, bo, usage, domains, &added_domains);
324
325    if (added_domains & RADEON_DOMAIN_GTT)
326        cs->csc->used_gart += bo->base.size;
327    if (added_domains & RADEON_DOMAIN_VRAM)
328        cs->csc->used_vram += bo->base.size;
329
330    return index;
331}
332
333static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
334{
335    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
336    boolean status =
337        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
338        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;
339
340    if (status) {
341        cs->csc->validated_crelocs = cs->csc->crelocs;
342    } else {
343        /* Remove lately-added relocations. The validation failed with them
344         * and the CS is about to be flushed because of that. Keep only
345         * the already-validated relocations. */
346        unsigned i;
347
348        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
349            p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
350            radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
351        }
352        cs->csc->crelocs = cs->csc->validated_crelocs;
353
354        /* Flush if there are any relocs. Clean up otherwise. */
355        if (cs->csc->crelocs) {
356            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
357        } else {
358            radeon_cs_context_cleanup(cs->csc);
359
360            assert(cs->base.cdw == 0);
361            if (cs->base.cdw != 0) {
362                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
363            }
364        }
365    }
366    return status;
367}
368
369static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
370                                      struct radeon_winsys_cs_handle *buf)
371{
372    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
373    struct radeon_bo *bo = (struct radeon_bo*)buf;
374
375    unsigned index = radeon_get_reloc(cs->csc, bo);
376
377    if (index == -1) {
378        fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
379        return;
380    }
381
382    OUT_CS(&cs->base, 0xc0001000);
383    OUT_CS(&cs->base, index * RELOC_DWORDS);
384}
385
386static void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc)
387{
388    unsigned i;
389
390    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
391                            &csc->cs, sizeof(struct drm_radeon_cs))) {
392        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
393            unsigned i;
394
395            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
396            for (i = 0; i < csc->chunks[0].length_dw; i++) {
397                fprintf(stderr, "0x%08X\n", csc->buf[i]);
398            }
399        } else {
400            fprintf(stderr, "radeon: The kernel rejected CS, "
401                    "see dmesg for more information.\n");
402        }
403    }
404
405    for (i = 0; i < csc->crelocs; i++)
406        p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);
407
408    radeon_cs_context_cleanup(csc);
409}
410
411static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
412{
413    struct radeon_drm_cs *cs = (struct radeon_drm_cs*)param;
414
415    while (1) {
416        pipe_semaphore_wait(&cs->flush_queued);
417        if (cs->kill_thread)
418            break;
419        radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
420        pipe_semaphore_signal(&cs->flush_completed);
421    }
422    pipe_semaphore_signal(&cs->flush_completed);
423    return NULL;
424}
425
426void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
427{
428    /* Wait for any pending ioctl to complete. */
429    if (cs->thread && cs->flush_started) {
430        pipe_semaphore_wait(&cs->flush_completed);
431        cs->flush_started = 0;
432    }
433}
434
/* Flush the current CS: flip the two contexts and submit the previously
 * current one, either on the worker thread (RADEON_FLUSH_ASYNC) or
 * synchronously, then set up an empty CS for further filling. */
static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
       fprintf(stderr, "radeon: command stream overflowed\n");
    }

    /* Wait for any previous asynchronous submission before reusing cst. */
    radeon_drm_cs_sync_flush(cs);

    /* Flip command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty or overflowed, emit it in a separate thread. */
    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS) {
        unsigned i, crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
        }

        /* flags[0] = CS flags, flags[1] = ring selection.  The flags chunk
         * (num_chunks == 3) is only sent when at least one flag is used. */
        cs->cst->flags[0] = 0;
        cs->cst->flags[1] = RADEON_CS_RING_GFX;
        cs->cst->cs.num_chunks = 2;
        if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
            cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->cs.num_chunks = 3;
        }
        if (cs->ws->info.r600_virtual_address) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
        }
        if (flags & RADEON_FLUSH_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
        }

        if (cs->thread &&
            (flags & RADEON_FLUSH_ASYNC)) {
            /* Hand the submission to the worker thread. */
            cs->flush_started = 1;
            pipe_semaphore_signal(&cs->flush_queued);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        }
    } else {
        /* Empty or overflowed CS: drop it without submitting. */
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}
493
494static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
495{
496    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
497    radeon_drm_cs_sync_flush(cs);
498    if (cs->thread) {
499        cs->kill_thread = 1;
500        pipe_semaphore_signal(&cs->flush_queued);
501        pipe_semaphore_wait(&cs->flush_completed);
502        pipe_thread_wait(cs->thread);
503    }
504    pipe_semaphore_destroy(&cs->flush_queued);
505    pipe_semaphore_destroy(&cs->flush_completed);
506    radeon_cs_context_cleanup(&cs->csc1);
507    radeon_cs_context_cleanup(&cs->csc2);
508    p_atomic_dec(&cs->ws->num_cs);
509    radeon_destroy_cs_context(&cs->csc1);
510    radeon_destroy_cs_context(&cs->csc2);
511    FREE(cs);
512}
513
514static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
515                                    void (*flush)(void *ctx, unsigned flags),
516                                    void *user)
517{
518    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
519    cs->flush_cs = flush;
520    cs->flush_data = user;
521}
522
523static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
524                                       struct radeon_winsys_cs_handle *_buf,
525                                       enum radeon_bo_usage usage)
526{
527    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
528    struct radeon_bo *bo = (struct radeon_bo*)_buf;
529    int index;
530
531    if (!bo->num_cs_references)
532        return FALSE;
533
534    index = radeon_get_reloc(cs->csc, bo);
535    if (index == -1)
536        return FALSE;
537
538    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
539        return TRUE;
540    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
541        return TRUE;
542
543    return FALSE;
544}
545
546void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
547{
548    ws->base.cs_create = radeon_drm_cs_create;
549    ws->base.cs_destroy = radeon_drm_cs_destroy;
550    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
551    ws->base.cs_validate = radeon_drm_cs_validate;
552    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
553    ws->base.cs_flush = radeon_drm_cs_flush;
554    ws->base.cs_set_flush_callback = radeon_drm_cs_set_flush;
555    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
556}
557