/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 *
 * Based on work from libdrm_radeon by:
 *      Aapo Tahkola <aet@rasterburn.org>
 *      Nicolai Haehnle <prefect_@gmx.net>
 *      Jérôme Glisse <glisse@freedesktop.org>
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Reloc writes and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_reloc(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The addition is skipped if the reloc is already present in the list, but
    any newly-referenced domains are still accounted for.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints an error message to stderr.
    (This is done in the pipe driver.)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_reloc. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_reloc.
*/
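
/* A rough usage sketch from the pipe driver's side, using the winsys hooks
 * installed at the bottom of this file (the buffer and domain values are
 * made up for illustration; real drivers wrap these calls in their own
 * helpers):
 *
 *    ws->cs_add_reloc(cs, buf, RADEON_USAGE_READ | RADEON_USAGE_WRITE,
 *                     RADEON_DOMAIN_VRAM);
 *    if (!ws->cs_validate(cs)) {
 *        (cs_validate has already flushed or cleaned up the CS here;
 *         re-add the relocs for this one operation and validate again,
 *         dropping the operation if it still doesn't fit)
 *    }
 *    ... write the command dwords for the operation ...
 *    ws->cs_write_reloc(cs, buf);   (emit the reloc for 'buf')
 */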

#include "radeon_drm_cs.h"

#include "util/u_memory.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>

/*
 * These are copied from radeon_drm; once an updated libdrm is released,
 * we should bump the configure.ac requirement for it and remove the
 * following definitions.
 */
#ifndef RADEON_CHUNK_ID_FLAGS
#define RADEON_CHUNK_ID_FLAGS       0x03

/* The first dword of RADEON_CHUNK_ID_FLAGS is a uint32 of these flags: */
#define RADEON_CS_KEEP_TILING_FLAGS 0x01
#endif

#ifndef RADEON_CS_USE_VM
#define RADEON_CS_USE_VM            0x02
/* The second dword of RADEON_CHUNK_ID_FLAGS is a uint32 that sets the ring type */
#define RADEON_CS_RING_GFX          0
#define RADEON_CS_RING_COMPUTE      1
#endif

#ifndef RADEON_CS_END_OF_FRAME
#define RADEON_CS_END_OF_FRAME      0x04
#endif


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

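/* Initialize a CS context: allocate the relocation arrays and set up the
 * three chunks (IB, relocations, flags) that the kernel CS ioctl consumes. */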
static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
                                      struct radeon_drm_winsys *ws)
{
    csc->fd = ws->fd;
    csc->nrelocs = 512;
    csc->relocs_bo = (struct radeon_bo**)
                     CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
    if (!csc->relocs_bo) {
        return FALSE;
    }

    csc->relocs = (struct drm_radeon_cs_reloc*)
                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
    if (!csc->relocs) {
        FREE(csc->relocs_bo);
        return FALSE;
    }

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
    return TRUE;
}

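/* Drop the references to all relocated buffers and reset the context, so that
 * it can be reused for the next command stream. */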
static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->crelocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i], NULL);
    }

    csc->crelocs = 0;
    csc->validated_crelocs = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;
    csc->used_gart = 0;
    csc->used_vram = 0;
    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}

DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param);

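/* Create a command stream. Two CS contexts are allocated so that one (csc)
 * can be filled while the other (cst) is being submitted to the kernel; on
 * multi-core machines, unless RADEON_THREAD is disabled, a thread is spawned
 * to do the submission asynchronously. */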
static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
{
    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    pipe_semaphore_init(&cs->flush_queued, 0);
    pipe_semaphore_init(&cs->flush_completed, 0);

    cs->ws = ws;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.buf = cs->csc->buf;

    p_atomic_inc(&ws->num_cs);
    if (cs->ws->num_cpus > 1 && debug_get_option_thread())
        cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs);
    return &cs->base;
}

#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)

static INLINE void update_reloc_domains(struct drm_radeon_cs_reloc *reloc,
                                        enum radeon_bo_domain rd,
                                        enum radeon_bo_domain wd,
                                        enum radeon_bo_domain *added_domains)
{
    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);

    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
}

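/* Return the index of the reloc for the given buffer, or -1 if the buffer
 * isn't referenced yet. The buffer handle is hashed first; on a collision,
 * the reloc list is scanned backwards and the hash slot is updated to point
 * at the match, so repeated lookups of the same buffer stay O(1). */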
int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);

    if (csc->is_handle_added[hash]) {
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle == bo->handle) {
            return i;
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                /* Put this reloc in the hash list.
                 * This will prevent additional hash collisions if there are
                 * several consecutive get_reloc calls for the same buffer.
                 *
                 * Example: Assuming buffers A,B,C collide in the hash list,
                 * the following sequence of relocs:
                 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
                 * will collide here: ^ and here:   ^,
                 * meaning that we should get very few collisions in the end. */
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    return -1;
}

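/* Add a relocation for the given buffer, or just OR the new domains into the
 * existing entry if the buffer is already on the list. The reloc array grows
 * on demand. Returns the reloc index and reports any newly referenced domains
 * through *added_domains so the caller can update the memory accounting. */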
static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
                                 struct radeon_bo *bo,
                                 enum radeon_bo_usage usage,
                                 enum radeon_bo_domain domains,
                                 enum radeon_bo_domain *added_domains)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;

    if (csc->is_handle_added[hash]) {
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle == bo->handle) {
            update_reloc_domains(reloc, rd, wd, added_domains);
            return i;
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                update_reloc_domains(reloc, rd, wd, added_domains);

                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = 0;

    csc->is_handle_added[hash] = TRUE;
    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}

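/* Winsys hook: add a relocation for 'buf' and charge its size to the GTT
 * and/or VRAM usage counters for every domain that wasn't referenced by this
 * buffer before. */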
static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
                                        struct radeon_winsys_cs_handle *buf,
                                        enum radeon_bo_usage usage,
                                        enum radeon_bo_domain domains)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    unsigned index = radeon_add_reloc(cs->csc, bo, usage, domains, &added_domains);

    if (added_domains & RADEON_DOMAIN_GTT)
        cs->csc->used_gart += bo->base.size;
    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->csc->used_vram += bo->base.size;

    return index;
}

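/* Winsys hook: check that the referenced buffers fit into 80% of GTT and
 * VRAM. On failure, the relocations added since the last successful
 * validation are dropped and the CS is flushed (or just cleaned up if nothing
 * validated remains), so the pipe driver can retry only the last operation. */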
static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove lately-added relocations. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated relocations. */
        unsigned i;

        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
        } else {
            radeon_cs_context_cleanup(cs->csc);

            assert(cs->base.cdw == 0);
            if (cs->base.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

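/* Winsys hook: emit the relocation for a buffer previously added with
 * cs_add_reloc. This writes a 0xc0001000 packet header (a PKT3 NOP with one
 * data dword) followed by the dword offset of the reloc entry within the
 * relocation chunk, which is how the kernel CS checker finds the buffer. */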
static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
                                      struct radeon_winsys_cs_handle *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;

    int index = radeon_get_reloc(cs->csc, bo);

    if (index == -1) {
        fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
        return;
    }

    OUT_CS(&cs->base, 0xc0001000);
    OUT_CS(&cs->base, index * RELOC_DWORDS);
}

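/* Submit one CS to the kernel through the DRM_RADEON_CS ioctl. On rejection,
 * either dump the IB contents (if RADEON_DUMP_CS is set) or point the user at
 * dmesg. Afterwards, drop the active-ioctl count of every referenced buffer
 * and reset the context. */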
static void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc)
{
    unsigned i;

    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs))) {
        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information.\n");
        }
    }

    for (i = 0; i < csc->crelocs; i++)
        p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

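/* Main loop of the submission thread: wait until a flush is queued, submit
 * the inactive CS context, and signal completion. The extra signal after the
 * loop unblocks the waiter in radeon_drm_cs_destroy. */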
static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
{
    struct radeon_drm_cs *cs = (struct radeon_drm_cs*)param;

    while (1) {
        pipe_semaphore_wait(&cs->flush_queued);
        if (cs->kill_thread)
            break;
        radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        pipe_semaphore_signal(&cs->flush_completed);
    }
    pipe_semaphore_signal(&cs->flush_completed);
    return NULL;
}

void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
{
    /* Wait for any pending ioctl to complete. */
    if (cs->thread && cs->flush_started) {
        pipe_semaphore_wait(&cs->flush_completed);
        cs->flush_started = 0;
    }
}

DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)

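/* Winsys hook: flush the command stream. The two CS contexts are swapped so
 * that the next CS can be built immediately while the previous one is
 * submitted, either directly or on the submission thread when
 * RADEON_FLUSH_ASYNC is requested. The flags chunk is only included
 * (num_chunks = 3) when tiling flags, virtual addressing, end-of-frame,
 * or the compute ring is actually needed. */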
static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    radeon_drm_cs_sync_flush(cs);

    /* Flip command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is neither empty nor overflowed, emit it. */
    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS &&
        !debug_get_option_noop()) {
        unsigned i, crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
        }

        cs->cst->flags[0] = 0;
        cs->cst->flags[1] = RADEON_CS_RING_GFX;
        cs->cst->cs.num_chunks = 2;
        if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
            cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->cs.num_chunks = 3;
        }
        if (cs->ws->info.r600_virtual_address) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
        }
        if (flags & RADEON_FLUSH_END_OF_FRAME) {
            cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
            cs->cst->cs.num_chunks = 3;
        }
        if (flags & RADEON_FLUSH_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
        }

        if (cs->thread &&
            (flags & RADEON_FLUSH_ASYNC)) {
            cs->flush_started = 1;
            pipe_semaphore_signal(&cs->flush_queued);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    radeon_drm_cs_sync_flush(cs);
    if (cs->thread) {
        cs->kill_thread = 1;
        pipe_semaphore_signal(&cs->flush_queued);
        pipe_semaphore_wait(&cs->flush_completed);
        pipe_thread_wait(cs->thread);
    }
    pipe_semaphore_destroy(&cs->flush_queued);
    pipe_semaphore_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    FREE(cs);
}

static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
                                    void (*flush)(void *ctx, unsigned flags),
                                    void *user)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    cs->flush_cs = flush;
    cs->flush_data = user;
}

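/* Winsys hook: report whether the buffer is referenced by the CS currently
 * being built, optionally restricted to read or write usage. */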
static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                       struct radeon_winsys_cs_handle *_buf,
                                       enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return FALSE;

    index = radeon_get_reloc(cs->csc, bo);
    if (index == -1)
        return FALSE;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return TRUE;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return TRUE;

    return FALSE;
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_set_flush_callback = radeon_drm_cs_set_flush;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
}