/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */

#include "sid.h"
#include "si_pipe.h"

#include "util/u_format.h"

static void si_dma_copy_buffer(struct si_context *ctx,
			       struct pipe_resource *dst,
			       struct pipe_resource *src,
			       uint64_t dst_offset,
			       uint64_t src_offset,
			       uint64_t size)
{
	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
	unsigned i, ncopy, count, max_size, sub_cmd, shift;
	struct r600_resource *rdst = (struct r600_resource*)dst;
	struct r600_resource *rsrc = (struct r600_resource*)src;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&rdst->valid_buffer_range, dst_offset,
		       dst_offset + size);

	dst_offset += rdst->gpu_address;
	src_offset += rsrc->gpu_address;

	/* See whether we should use the dword-aligned or byte-aligned copy. */
	if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
		sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
		shift = 2;
		max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
	} else {
		sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
		shift = 0;
		max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
	}

	ncopy = DIV_ROUND_UP(size, max_size);
	r600_need_dma_space(&ctx->b, ncopy * 5, rdst, rsrc);

	for (i = 0; i < ncopy; i++) {
		count = MIN2(size, max_size);
		radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd,
					      count >> shift));
		radeon_emit(cs, dst_offset);
		radeon_emit(cs, src_offset);
		radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
		radeon_emit(cs, (src_offset >> 32UL) & 0xff);
		dst_offset += count;
		src_offset += count;
		size -= count;
	}
}
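
/* For illustration (a sketch, not emitted anywhere): a 16-byte copy with
 * dword-aligned offsets and size would produce a single 5-dword packet in
 * the loop above:
 *
 *   SI_DMA_PACKET(SI_DMA_PACKET_COPY, SI_DMA_COPY_DWORD_ALIGNED, 16 >> 2)
 *   dst_offset[31:0]
 *   src_offset[31:0]
 *   dst_offset[39:32]
 *   src_offset[39:32]
 *
 * See sid.h for the exact header encoding.
 */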

static void si_dma_clear_buffer(struct pipe_context *ctx,
				struct pipe_resource *dst,
				uint64_t offset,
				uint64_t size,
				unsigned clear_value)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct radeon_winsys_cs *cs = sctx->b.dma.cs;
	unsigned i, ncopy, csize;
	struct r600_resource *rdst = r600_resource(dst);

	if (!cs || offset % 4 != 0 || size % 4 != 0) {
		ctx->clear_buffer(ctx, dst, offset, size, &clear_value, 4);
		return;
	}

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&rdst->valid_buffer_range, offset, offset + size);

	offset += rdst->gpu_address;

	/* Use the same maximum size as for copying. */
	ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
	r600_need_dma_space(&sctx->b, ncopy * 4, rdst, NULL);

	for (i = 0; i < ncopy; i++) {
		csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
		radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0,
					      csize / 4));
		radeon_emit(cs, offset);
		radeon_emit(cs, clear_value);
		radeon_emit(cs, (offset >> 32) << 16);
		offset += csize;
		size -= csize;
	}
}

static void si_dma_copy_tile(struct si_context *ctx,
			     struct pipe_resource *dst,
			     unsigned dst_level,
			     unsigned dst_x,
			     unsigned dst_y,
			     unsigned dst_z,
			     struct pipe_resource *src,
			     unsigned src_level,
			     unsigned src_x,
			     unsigned src_y,
			     unsigned src_z,
			     unsigned copy_height,
			     unsigned pitch,
			     unsigned bpp)
{
	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
	struct r600_texture *rsrc = (struct r600_texture*)src;
	struct r600_texture *rdst = (struct r600_texture*)dst;
	unsigned dst_mode = rdst->surface.level[dst_level].mode;
	bool detile = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
	struct r600_texture *rlinear = detile ? rdst : rsrc;
	struct r600_texture *rtiled = detile ? rsrc : rdst;
	unsigned linear_lvl = detile ? dst_level : src_level;
	unsigned tiled_lvl = detile ? src_level : dst_level;
	struct radeon_info *info = &ctx->screen->b.info;
	unsigned index = rtiled->surface.tiling_index[tiled_lvl];
	unsigned tile_mode = info->si_tile_mode_array[index];
	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
	unsigned ncopy, height, cheight, i;
	unsigned linear_x, linear_y, linear_z, tiled_x, tiled_y, tiled_z;
	unsigned sub_cmd, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt;
	uint64_t base, addr;
	unsigned pipe_config;

	assert(dst_mode != rsrc->surface.level[src_level].mode);

	sub_cmd = SI_DMA_COPY_TILED;
	lbpp = util_logbase2(bpp);
	pitch_tile_max = ((pitch / bpp) / 8) - 1;

	linear_x = detile ? dst_x : src_x;
	linear_y = detile ? dst_y : src_y;
	linear_z = detile ? dst_z : src_z;
	tiled_x = detile ? src_x : dst_x;
	tiled_y = detile ? src_y : dst_y;
	tiled_z = detile ? src_z : dst_z;

	assert(!util_format_is_depth_and_stencil(rtiled->resource.b.b.format));

	array_mode = G_009910_ARRAY_MODE(tile_mode);
	slice_tile_max = (rtiled->surface.level[tiled_lvl].nblk_x *
			  rtiled->surface.level[tiled_lvl].nblk_y) / (8*8) - 1;
	/* The linear height must be the same as the slice tile max height.
	 * This is fine even if the linear destination/source has a smaller
	 * height, because the DMA packet size uses copy_height, which is
	 * always less than or equal to the linear height.
	 */
	height = rtiled->surface.level[tiled_lvl].nblk_y;
	base = rtiled->surface.level[tiled_lvl].offset;
	addr = rlinear->surface.level[linear_lvl].offset;
	addr += rlinear->surface.level[linear_lvl].slice_size * linear_z;
	addr += linear_y * pitch + linear_x * bpp;
	bank_h = G_009910_BANK_HEIGHT(tile_mode);
	bank_w = G_009910_BANK_WIDTH(tile_mode);
	mt_aspect = G_009910_MACRO_TILE_ASPECT(tile_mode);
	/* Non-depth modes don't have TILE_SPLIT set. */
	tile_split = util_logbase2(rtiled->surface.tile_split >> 6);
	nbanks = G_009910_NUM_BANKS(tile_mode);
	base += rtiled->resource.gpu_address;
	addr += rlinear->resource.gpu_address;

	pipe_config = G_009910_PIPE_CONFIG(tile_mode);
	mt = G_009910_MICRO_TILE_MODE(tile_mode);
	size = copy_height * pitch;
	ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
	r600_need_dma_space(&ctx->b, ncopy * 9, &rdst->resource, &rsrc->resource);

	for (i = 0; i < ncopy; i++) {
		cheight = copy_height;
		if (cheight * pitch > SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE) {
			cheight = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE / pitch;
		}
		size = cheight * pitch;
		radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, size / 4));
		radeon_emit(cs, base >> 8);
		radeon_emit(cs, (detile << 31) | (array_mode << 27) |
				(lbpp << 24) | (bank_h << 21) |
				(bank_w << 18) | (mt_aspect << 16));
		radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16));
		radeon_emit(cs, (slice_tile_max << 0) | (pipe_config << 26));
		radeon_emit(cs, (tiled_x << 0) | (tiled_z << 18));
		radeon_emit(cs, (tiled_y << 0) | (tile_split << 21) | (nbanks << 25) | (mt << 27));
		radeon_emit(cs, addr & 0xfffffffc);
		radeon_emit(cs, (addr >> 32UL) & 0xff);
		copy_height -= cheight;
		addr += cheight * pitch;
		tiled_y += cheight;
	}
}
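
/* Note on the packet above: the tiled copy always describes the tiled
 * surface (dwords 1-6, with dword 0 being the header) and the linear
 * address (dwords 7-8); the detile bit (bit 31 of the tiling-parameters
 * dword) selects the direction, i.e. tiled -> linear when the destination
 * is linear, linear -> tiled otherwise. Each packet is 9 dwords, which is
 * why r600_need_dma_space reserves ncopy * 9 here, matching ncopy * 5 for
 * buffer copies and ncopy * 4 for constant fills above.
 */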

static void si_dma_copy(struct pipe_context *ctx,
			struct pipe_resource *dst,
			unsigned dst_level,
			unsigned dstx, unsigned dsty, unsigned dstz,
			struct pipe_resource *src,
			unsigned src_level,
			const struct pipe_box *src_box)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct r600_texture *rsrc = (struct r600_texture*)src;
	struct r600_texture *rdst = (struct r600_texture*)dst;
	unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode;
	unsigned src_w, dst_w;
	unsigned src_x, src_y;
	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;

	if (sctx->b.dma.cs == NULL) {
		goto fallback;
	}

	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
		si_dma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width);
		return;
	}

	/* XXX: Using the asynchronous DMA engine for multi-dimensional
	 * operations seems to cause random GPU lockups for various people.
	 * While the root cause for this might need to be fixed in the kernel,
	 * let's disable it for now.
	 *
	 * Before re-enabling this, please make sure you can hit all newly
	 * enabled paths in your testing, preferably with both piglit and real
	 * world apps, and get in touch with people on the bug reports below
	 * for stability testing.
	 *
	 * https://bugs.freedesktop.org/show_bug.cgi?id=85647
	 * https://bugs.freedesktop.org/show_bug.cgi?id=83500
	 */
	goto fallback;

	if (src_box->depth > 1 ||
	    !r600_prepare_for_dma_blit(&sctx->b, rdst, dst_level, dstx, dsty,
				       dstz, rsrc, src_level, src_box))
		goto fallback;

	src_x = util_format_get_nblocksx(src->format, src_box->x);
	dst_x = util_format_get_nblocksx(src->format, dst_x);
	src_y = util_format_get_nblocksy(src->format, src_box->y);
	dst_y = util_format_get_nblocksy(src->format, dst_y);

	bpp = rdst->surface.bpe;
	dst_pitch = rdst->surface.level[dst_level].nblk_x * rdst->surface.bpe;
	src_pitch = rsrc->surface.level[src_level].nblk_x * rsrc->surface.bpe;
	src_w = u_minify(rsrc->resource.b.b.width0, src_level);
	dst_w = u_minify(rdst->resource.b.b.width0, dst_level);

	dst_mode = rdst->surface.level[dst_level].mode;
	src_mode = rsrc->surface.level[src_level].mode;

	if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w ||
	    src_box->width != src_w ||
	    src_box->height != u_minify(rsrc->resource.b.b.height0, src_level) ||
	    src_box->height != u_minify(rdst->resource.b.b.height0, dst_level) ||
	    rsrc->surface.level[src_level].nblk_y !=
	    rdst->surface.level[dst_level].nblk_y) {
		/* FIXME: SI can do partial blits. */
		goto fallback;
	}
	/* The x tests here are currently useless (because we don't support
	 * partial blits), but keep them around so we don't forget about them.
	 */
	if ((src_pitch % 8) || (src_box->x % 8) || (dst_x % 8) ||
	    (src_box->y % 8) || (dst_y % 8) || (src_box->height % 8)) {
		goto fallback;
	}

	if (src_mode == dst_mode) {
		uint64_t dst_offset, src_offset;
		/* A simple DMA blit will do. NOTE: the code here assumes:
		 *   src_box->x/y == 0
		 *   dst_x/y == 0
		 *   dst_pitch == src_pitch
		 */
		src_offset = rsrc->surface.level[src_level].offset;
		src_offset += rsrc->surface.level[src_level].slice_size * src_box->z;
		src_offset += src_y * src_pitch + src_x * bpp;
		dst_offset = rdst->surface.level[dst_level].offset;
		dst_offset += rdst->surface.level[dst_level].slice_size * dst_z;
		dst_offset += dst_y * dst_pitch + dst_x * bpp;
		si_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset,
				   rsrc->surface.level[src_level].slice_size);
	} else {
		si_dma_copy_tile(sctx, dst, dst_level, dst_x, dst_y, dst_z,
				 src, src_level, src_x, src_y, src_box->z,
				 src_box->height / rsrc->surface.blk_h,
				 dst_pitch, bpp);
	}
	return;

fallback:
	si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
				src, src_level, src_box);
}
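
/* Hypothetical usage sketch (buf is an illustrative placeholder): once the
 * hooks below are installed, a 4-byte-aligned clear takes the SDMA path:
 *
 *   sctx->b.dma_clear_buffer(&sctx->b.b, buf, 0, 256, 0);
 *
 * whereas an unaligned offset or size falls back to ctx->clear_buffer at
 * the top of si_dma_clear_buffer().
 */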

void si_init_dma_functions(struct si_context *sctx)
{
	sctx->b.dma_copy = si_dma_copy;
	sctx->b.dma_clear_buffer = si_dma_clear_buffer;
}
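
/* si_init_dma_functions() is expected to be called once at context
 * creation (si_create_context() in si_pipe.c in this era of the tree) so
 * that common code dispatches buffer copies and clears through the SDMA
 * paths above.
 */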