1/*
2 * Copyright © 2008 Jérôme Glisse
3 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27/*
28 * Authors:
29 *      Marek Olšák <maraeo@gmail.com>
30 *
31 * Based on work from libdrm_radeon by:
32 *      Aapo Tahkola <aet@rasterburn.org>
33 *      Nicolai Haehnle <prefect_@gmx.net>
34 *      Jérôme Glisse <glisse@freedesktop.org>
35 */
36
37/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
39    It's optimized specifically for Radeon DRM.
40    Reloc writes and space checking are faster and simpler than their
41    counterparts in libdrm (the time complexity of all the functions
42    is O(1) in nearly all scenarios, thanks to hashing).
43
44    It works like this:
45
46    cs_add_reloc(cs, buf, read_domain, write_domain) adds a new relocation and
47    also adds the size of 'buf' to the used_gart and used_vram winsys variables
48    based on the domains, which are simply or'd for the accounting purposes.
49    The adding is skipped if the reloc is already present in the list, but it
50    accounts any newly-referenced domains.
51
52    cs_validate is then called, which just checks:
53        used_vram/gart < vram/gart_size * 0.8
54    The 0.8 number allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes CS and tries to do the validation again,
56    i.e. it validates only that one operation. If it fails again, it drops
57    the operation on the floor and prints some nasty message to stderr.
58    (done in the pipe driver)
59
60    cs_write_reloc(cs, buf) just writes a reloc that has been added using
61    cs_add_reloc. The read_domain and write_domain parameters have been removed,
62    because we already specify them in cs_add_reloc.
63*/
64
65#include "radeon_drm_cs.h"
66
67#include "util/u_memory.h"
68
69#include <stdio.h>
70#include <stdlib.h>
71#include <stdint.h>
72#include <xf86drm.h>
73
74/*
75 * this are copy from radeon_drm, once an updated libdrm is released
76 * we should bump configure.ac requirement for it and remove the following
77 * field
78 */
79#ifndef RADEON_CHUNK_ID_FLAGS
80#define RADEON_CHUNK_ID_FLAGS       0x03
81
82/* The first dword of RADEON_CHUNK_ID_FLAGS is a uint32 of these flags: */
83#define RADEON_CS_KEEP_TILING_FLAGS 0x01
84
85
86#endif
87
88#ifndef RADEON_CS_USE_VM
89#define RADEON_CS_USE_VM            0x02
90/* The second dword of RADEON_CHUNK_ID_FLAGS is a uint32 that sets the ring type */
91#define RADEON_CS_RING_GFX          0
92#define RADEON_CS_RING_COMPUTE      1
93#endif
94
95
96#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
97
98static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
99                                      struct radeon_drm_winsys *ws)
100{
101    csc->fd = ws->fd;
102    csc->nrelocs = 512;
103    csc->relocs_bo = (struct radeon_bo**)
104                     CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
105    if (!csc->relocs_bo) {
106        return FALSE;
107    }
108
109    csc->relocs = (struct drm_radeon_cs_reloc*)
110                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
111    if (!csc->relocs) {
112        FREE(csc->relocs_bo);
113        return FALSE;
114    }
115
116    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
117    csc->chunks[0].length_dw = 0;
118    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
119    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
120    csc->chunks[1].length_dw = 0;
121    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
122    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
123    csc->chunks[2].length_dw = 2;
124    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
125
126    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
127    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
128    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
129
130    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
131    return TRUE;
132}
133
134static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
135{
136    unsigned i;
137
138    for (i = 0; i < csc->crelocs; i++) {
139        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
140        radeon_bo_reference(&csc->relocs_bo[i], NULL);
141    }
142
143    csc->crelocs = 0;
144    csc->validated_crelocs = 0;
145    csc->chunks[0].length_dw = 0;
146    csc->chunks[1].length_dw = 0;
147    csc->used_gart = 0;
148    csc->used_vram = 0;
149    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
150}
151
152static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
153{
154    radeon_cs_context_cleanup(csc);
155    FREE(csc->relocs_bo);
156    FREE(csc->relocs);
157}
158
159DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
160static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param);
161
162static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
163{
164    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
165    struct radeon_drm_cs *cs;
166
167    cs = CALLOC_STRUCT(radeon_drm_cs);
168    if (!cs) {
169        return NULL;
170    }
171    pipe_semaphore_init(&cs->flush_queued, 0);
172    pipe_semaphore_init(&cs->flush_completed, 0);
173
174    cs->ws = ws;
175
176    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
177        FREE(cs);
178        return NULL;
179    }
180    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
181        radeon_destroy_cs_context(&cs->csc1);
182        FREE(cs);
183        return NULL;
184    }
185
186    /* Set the first command buffer as current. */
187    cs->csc = &cs->csc1;
188    cs->cst = &cs->csc2;
189    cs->base.buf = cs->csc->buf;
190
191    p_atomic_inc(&ws->num_cs);
192    if (cs->ws->num_cpus > 1 && debug_get_option_thread())
193        cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs);
194    return &cs->base;
195}
196
197#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
198
199static INLINE void update_reloc_domains(struct drm_radeon_cs_reloc *reloc,
200                                        enum radeon_bo_domain rd,
201                                        enum radeon_bo_domain wd,
202                                        enum radeon_bo_domain *added_domains)
203{
204    *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
205
206    reloc->read_domains |= rd;
207    reloc->write_domain |= wd;
208}
209
/* Find the index of the relocation entry for 'bo' in 'csc', or return -1
 * if the buffer has not been added via radeon_add_reloc.
 *
 * Lookup is O(1) in the common case: the low bits of the BO handle select
 * a slot in a small hash table that caches the last reloc index seen for
 * that hash. On a collision, the reloc list is scanned linearly (from the
 * end, since recently-added buffers are the most likely hits) and the
 * hash slot is refreshed so repeated lookups of the same buffer stay O(1).
 *
 * NOTE(review): the mask below assumes sizeof(csc->is_handle_added) is a
 * power of two with one byte per element -- confirm in radeon_drm_cs.h. */
int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);

    if (csc->is_handle_added[hash]) {
        /* Fast path: the cached index for this hash slot matches. */
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle == bo->handle) {
            return i;
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                /* Put this reloc in the hash list.
                 * This will prevent additional hash collisions if there are
                 * several consecutive get_reloc calls for the same buffer.
                 *
                 * Example: Assuming buffers A,B,C collide in the hash list,
                 * the following sequence of relocs:
                 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
                 * will collide here: ^ and here:   ^,
                 * meaning that we should get very few collisions in the end. */
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    return -1;
}
246
/* Add a relocation for 'bo' to the CS context, or merge the new usage
 * into an existing entry if the buffer is already in the list.
 *
 * Returns the index of the reloc entry. *added_domains receives the
 * domains newly referenced by this call (possibly 0 for an existing
 * entry), which the caller uses for GART/VRAM accounting.
 *
 * Same O(1) hash lookup scheme as radeon_get_reloc; see the comments
 * there for the collision-handling strategy. */
static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
                                 struct radeon_bo *bo,
                                 enum radeon_bo_usage usage,
                                 enum radeon_bo_domain domains,
                                 enum radeon_bo_domain *added_domains)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
    /* Split the requested domains by direction of access. */
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;

    if (csc->is_handle_added[hash]) {
        /* Fast path: the cached index for this hash slot matches. */
        i = csc->reloc_indices_hashlist[hash];
        reloc = &csc->relocs[i];
        if (reloc->handle == bo->handle) {
            update_reloc_domains(reloc, rd, wd, added_domains);
            return i;
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                update_reloc_domains(reloc, rd, wd, added_domains);

                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        /* NOTE(review): both realloc results are unchecked; on failure the
         * old pointer is overwritten with NULL (leak) and dereferenced
         * below. Consider a checked-realloc pattern with a temporary. */
        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size);

        /* The reloc array may have moved; re-point the kernel chunk at it. */
        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = 0;

    /* Remember this buffer in the hash table for fast repeat lookups. */
    csc->is_handle_added[hash] = TRUE;
    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}
313
314static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
315                                        struct radeon_winsys_cs_handle *buf,
316                                        enum radeon_bo_usage usage,
317                                        enum radeon_bo_domain domains)
318{
319    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
320    struct radeon_bo *bo = (struct radeon_bo*)buf;
321    enum radeon_bo_domain added_domains;
322
323    unsigned index = radeon_add_reloc(cs->csc, bo, usage, domains, &added_domains);
324
325    if (added_domains & RADEON_DOMAIN_GTT)
326        cs->csc->used_gart += bo->base.size;
327    if (added_domains & RADEON_DOMAIN_VRAM)
328        cs->csc->used_vram += bo->base.size;
329
330    return index;
331}
332
/* Check that the memory referenced by the current CS fits in 80% of
 * GART and VRAM (the slack allows for fragmentation).
 *
 * On success, remember how many relocs have been validated. On failure,
 * roll back to the last validated set of relocs and either flush the CS
 * asynchronously (if anything validated remains) or reset the context.
 * Returns the validation status. */
static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    boolean status =
        cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->csc->used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->validated_crelocs = cs->csc->crelocs;
    } else {
        /* Remove lately-added relocations. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated relocations. */
        unsigned i;

        /* Drop the references taken by radeon_add_reloc for the relocs
         * being removed. */
        for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
        }
        cs->csc->crelocs = cs->csc->validated_crelocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->crelocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
        } else {
            radeon_cs_context_cleanup(cs->csc);

            /* An empty reloc list should imply an empty command buffer. */
            assert(cs->base.cdw == 0);
            if (cs->base.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}
368
369static boolean radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
370{
371    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
372    boolean status =
373        (cs->csc->used_gart + gtt) < cs->ws->info.gart_size * 0.7 &&
374        (cs->csc->used_vram + vram) < cs->ws->info.vram_size * 0.7;
375
376    return status;
377}
378
379static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
380                                      struct radeon_winsys_cs_handle *buf)
381{
382    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
383    struct radeon_bo *bo = (struct radeon_bo*)buf;
384
385    unsigned index = radeon_get_reloc(cs->csc, bo);
386
387    if (index == -1) {
388        fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
389        return;
390    }
391
392    OUT_CS(&cs->base, 0xc0001000);
393    OUT_CS(&cs->base, index * RELOC_DWORDS);
394}
395
396static void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc)
397{
398    unsigned i;
399
400    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
401                            &csc->cs, sizeof(struct drm_radeon_cs))) {
402        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
403            unsigned i;
404
405            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
406            for (i = 0; i < csc->chunks[0].length_dw; i++) {
407                fprintf(stderr, "0x%08X\n", csc->buf[i]);
408            }
409        } else {
410            fprintf(stderr, "radeon: The kernel rejected CS, "
411                    "see dmesg for more information.\n");
412        }
413    }
414
415    for (i = 0; i < csc->crelocs; i++)
416        p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);
417
418    radeon_cs_context_cleanup(csc);
419}
420
421static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
422{
423    struct radeon_drm_cs *cs = (struct radeon_drm_cs*)param;
424
425    while (1) {
426        pipe_semaphore_wait(&cs->flush_queued);
427        if (cs->kill_thread)
428            break;
429        radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
430        pipe_semaphore_signal(&cs->flush_completed);
431    }
432    pipe_semaphore_signal(&cs->flush_completed);
433    return NULL;
434}
435
436void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
437{
438    /* Wait for any pending ioctl to complete. */
439    if (cs->thread && cs->flush_started) {
440        pipe_semaphore_wait(&cs->flush_completed);
441        cs->flush_started = 0;
442    }
443}
444
/* Flush the current command stream: swap the two CS contexts, set up the
 * chunk lengths and flag dwords, and submit either on the flush thread
 * (RADEON_FLUSH_ASYNC) or synchronously. Afterwards the caller-facing
 * buffer points at the fresh (now-current) context. */
static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
       fprintf(stderr, "radeon: command stream overflowed\n");
    }

    /* A previous async flush must finish before we reuse its context. */
    radeon_drm_cs_sync_flush(cs);

    /* Flip command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty or overflowed, emit it in a separate thread. */
    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS) {
        unsigned i, crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
        }

        /* The flags chunk (chunk 2) is only passed to the kernel when at
         * least one flag is set, by raising num_chunks from 2 to 3. */
        cs->cst->flags[0] = 0;
        cs->cst->flags[1] = RADEON_CS_RING_GFX;
        cs->cst->cs.num_chunks = 2;
        if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
            cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->cs.num_chunks = 3;
        }
        if (cs->ws->info.r600_virtual_address) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
        }
        if (flags & RADEON_FLUSH_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
        }

        if (cs->thread &&
            (flags & RADEON_FLUSH_ASYNC)) {
            /* Hand the context to the flush thread; sync_flush will wait
             * on flush_completed before this context is reused. */
            cs->flush_started = 1;
            pipe_semaphore_signal(&cs->flush_queued);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}
503
504static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
505{
506    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
507    radeon_drm_cs_sync_flush(cs);
508    if (cs->thread) {
509        cs->kill_thread = 1;
510        pipe_semaphore_signal(&cs->flush_queued);
511        pipe_semaphore_wait(&cs->flush_completed);
512        pipe_thread_wait(cs->thread);
513    }
514    pipe_semaphore_destroy(&cs->flush_queued);
515    pipe_semaphore_destroy(&cs->flush_completed);
516    radeon_cs_context_cleanup(&cs->csc1);
517    radeon_cs_context_cleanup(&cs->csc2);
518    p_atomic_dec(&cs->ws->num_cs);
519    radeon_destroy_cs_context(&cs->csc1);
520    radeon_destroy_cs_context(&cs->csc2);
521    FREE(cs);
522}
523
524static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
525                                    void (*flush)(void *ctx, unsigned flags),
526                                    void *user)
527{
528    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
529    cs->flush_cs = flush;
530    cs->flush_data = user;
531}
532
533static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
534                                       struct radeon_winsys_cs_handle *_buf,
535                                       enum radeon_bo_usage usage)
536{
537    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
538    struct radeon_bo *bo = (struct radeon_bo*)_buf;
539    int index;
540
541    if (!bo->num_cs_references)
542        return FALSE;
543
544    index = radeon_get_reloc(cs->csc, bo);
545    if (index == -1)
546        return FALSE;
547
548    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
549        return TRUE;
550    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
551        return TRUE;
552
553    return FALSE;
554}
555
556void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
557{
558    ws->base.cs_create = radeon_drm_cs_create;
559    ws->base.cs_destroy = radeon_drm_cs_destroy;
560    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
561    ws->base.cs_validate = radeon_drm_cs_validate;
562    ws->base.cs_memory_below_limit = radeon_drm_cs_memory_below_limit;
563    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
564    ws->base.cs_flush = radeon_drm_cs_flush;
565    ws->base.cs_set_flush_callback = radeon_drm_cs_set_flush;
566    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
567}
568