/* brw_blorp_blit.cpp revision eac4f1a70772c1480778bae2563199c12634893e */
1/* 2 * Copyright © 2012 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "main/teximage.h" 25 26#include "glsl/ralloc.h" 27 28#include "intel_fbo.h" 29 30#include "brw_blorp.h" 31#include "brw_context.h" 32#include "brw_eu.h" 33#include "brw_state.h" 34 35 36/** 37 * Helper function for handling mirror image blits. 38 * 39 * If coord0 > coord1, swap them and invert the "mirror" boolean. 40 */ 41static inline void 42fixup_mirroring(bool &mirror, GLint &coord0, GLint &coord1) 43{ 44 if (coord0 > coord1) { 45 mirror = !mirror; 46 GLint tmp = coord0; 47 coord0 = coord1; 48 coord1 = tmp; 49 } 50} 51 52 53/** 54 * Adjust {src,dst}_x{0,1} to account for clipping and scissoring of 55 * destination coordinates. 56 * 57 * Return true if there is still blitting to do, false if all pixels got 58 * rejected by the clip and/or scissor. 
59 * 60 * For clarity, the nomenclature of this function assumes we are clipping and 61 * scissoring the X coordinate; the exact same logic applies for Y 62 * coordinates. 63 * 64 * Note: this function may also be used to account for clipping of source 65 * coordinates, by swapping the roles of src and dst. 66 */ 67static inline bool 68clip_or_scissor(bool mirror, GLint &src_x0, GLint &src_x1, GLint &dst_x0, 69 GLint &dst_x1, GLint fb_xmin, GLint fb_xmax) 70{ 71 /* If we are going to scissor everything away, stop. */ 72 if (!(fb_xmin < fb_xmax && 73 dst_x0 < fb_xmax && 74 fb_xmin < dst_x1 && 75 dst_x0 < dst_x1)) { 76 return false; 77 } 78 79 /* Clip the destination rectangle, and keep track of how many pixels we 80 * clipped off of the left and right sides of it. 81 */ 82 GLint pixels_clipped_left = 0; 83 GLint pixels_clipped_right = 0; 84 if (dst_x0 < fb_xmin) { 85 pixels_clipped_left = fb_xmin - dst_x0; 86 dst_x0 = fb_xmin; 87 } 88 if (fb_xmax < dst_x1) { 89 pixels_clipped_right = dst_x1 - fb_xmax; 90 dst_x1 = fb_xmax; 91 } 92 93 /* If we are mirrored, then before applying pixels_clipped_{left,right} to 94 * the source coordinates, we need to flip them to account for the 95 * mirroring. 96 */ 97 if (mirror) { 98 GLint tmp = pixels_clipped_left; 99 pixels_clipped_left = pixels_clipped_right; 100 pixels_clipped_right = tmp; 101 } 102 103 /* Adjust the source rectangle to remove the pixels corresponding to those 104 * that were clipped/scissored out of the destination rectangle. 105 */ 106 src_x0 += pixels_clipped_left; 107 src_x1 -= pixels_clipped_right; 108 109 return true; 110} 111 112 113static bool 114try_blorp_blit(struct intel_context *intel, 115 GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, 116 GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, 117 GLenum filter, GLbitfield buffer_bit) 118{ 119 struct gl_context *ctx = &intel->ctx; 120 121 /* Sync up the state of window system buffers. We need to do this before 122 * we go looking for the buffers. 
123 */ 124 intel_prepare_render(intel); 125 126 /* Find buffers */ 127 const struct gl_framebuffer *read_fb = ctx->ReadBuffer; 128 const struct gl_framebuffer *draw_fb = ctx->DrawBuffer; 129 struct gl_renderbuffer *src_rb; 130 struct gl_renderbuffer *dst_rb; 131 switch (buffer_bit) { 132 case GL_COLOR_BUFFER_BIT: 133 src_rb = read_fb->_ColorReadBuffer; 134 dst_rb = 135 draw_fb->Attachment[ 136 draw_fb->_ColorDrawBufferIndexes[0]].Renderbuffer; 137 break; 138 case GL_DEPTH_BUFFER_BIT: 139 src_rb = read_fb->Attachment[BUFFER_DEPTH].Renderbuffer; 140 dst_rb = draw_fb->Attachment[BUFFER_DEPTH].Renderbuffer; 141 break; 142 case GL_STENCIL_BUFFER_BIT: 143 src_rb = read_fb->Attachment[BUFFER_STENCIL].Renderbuffer; 144 dst_rb = draw_fb->Attachment[BUFFER_STENCIL].Renderbuffer; 145 break; 146 default: 147 assert(false); 148 } 149 150 /* Find source miptree */ 151 struct intel_renderbuffer *src_irb = intel_renderbuffer(src_rb); 152 struct intel_mipmap_tree *src_mt = src_irb->mt; 153 if (buffer_bit == GL_STENCIL_BUFFER_BIT && src_mt->stencil_mt) 154 src_mt = src_mt->stencil_mt; 155 156 /* Find destination miptree */ 157 struct intel_renderbuffer *dst_irb = intel_renderbuffer(dst_rb); 158 struct intel_mipmap_tree *dst_mt = dst_irb->mt; 159 if (buffer_bit == GL_STENCIL_BUFFER_BIT && dst_mt->stencil_mt) 160 dst_mt = dst_mt->stencil_mt; 161 162 /* Blorp blits can't translate from one format to another. For that we'll 163 * have to fall back to the meta-op blit. Note: the meta-op blit doesn't 164 * support multisampled blits, but fortunately this is ok because 165 * multisampled blits require identical source and destination formats. 
166 */ 167 if (src_mt->format != dst_mt->format) 168 return false; 169 170 /* Detect if the blit needs to be mirrored */ 171 bool mirror_x = false, mirror_y = false; 172 fixup_mirroring(mirror_x, srcX0, srcX1); 173 fixup_mirroring(mirror_x, dstX0, dstX1); 174 fixup_mirroring(mirror_y, srcY0, srcY1); 175 fixup_mirroring(mirror_y, dstY0, dstY1); 176 177 /* Make sure width and height match */ 178 if (srcX1 - srcX0 != dstX1 - dstX0) return false; 179 if (srcY1 - srcY0 != dstY1 - dstY0) return false; 180 181 /* If the destination rectangle needs to be clipped or scissored, do so. 182 */ 183 if (!(clip_or_scissor(mirror_x, srcX0, srcX1, dstX0, dstX1, 184 draw_fb->_Xmin, draw_fb->_Xmax) && 185 clip_or_scissor(mirror_y, srcY0, srcY1, dstY0, dstY1, 186 draw_fb->_Ymin, draw_fb->_Ymax))) { 187 /* Everything got clipped/scissored away, so the blit was successful. */ 188 return true; 189 } 190 191 /* If the source rectangle needs to be clipped or scissored, do so. */ 192 if (!(clip_or_scissor(mirror_x, dstX0, dstX1, srcX0, srcX1, 193 0, read_fb->Width) && 194 clip_or_scissor(mirror_y, dstY0, dstY1, srcY0, srcY1, 195 0, read_fb->Height))) { 196 /* Everything got clipped/scissored away, so the blit was successful. */ 197 return true; 198 } 199 200 /* Account for the fact that in the system framebuffer, the origin is at 201 * the lower left. 202 */ 203 if (read_fb->Name == 0) { 204 GLint tmp = read_fb->Height - srcY0; 205 srcY0 = read_fb->Height - srcY1; 206 srcY1 = tmp; 207 mirror_y = !mirror_y; 208 } 209 if (draw_fb->Name == 0) { 210 GLint tmp = draw_fb->Height - dstY0; 211 dstY0 = draw_fb->Height - dstY1; 212 dstY1 = tmp; 213 mirror_y = !mirror_y; 214 } 215 216 /* Get ready to blit. This includes depth resolving the src and dst 217 * buffers if necessary. 
218 */ 219 intel_renderbuffer_resolve_depth(intel, src_irb); 220 intel_renderbuffer_resolve_depth(intel, dst_irb); 221 222 /* Do the blit */ 223 brw_blorp_blit_params params(brw_context(ctx), src_mt, dst_mt, 224 srcX0, srcY0, dstX0, dstY0, dstX1, dstY1, 225 mirror_x, mirror_y); 226 brw_blorp_exec(intel, ¶ms); 227 228 /* Mark the dst buffer as needing a HiZ resolve if necessary. */ 229 intel_renderbuffer_set_needs_hiz_resolve(dst_irb); 230 231 return true; 232} 233 234GLbitfield 235brw_blorp_framebuffer(struct intel_context *intel, 236 GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, 237 GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, 238 GLbitfield mask, GLenum filter) 239{ 240 /* BLORP is not supported before Gen6. */ 241 if (intel->gen < 6) 242 return mask; 243 244 static GLbitfield buffer_bits[] = { 245 GL_COLOR_BUFFER_BIT, 246 GL_DEPTH_BUFFER_BIT, 247 GL_STENCIL_BUFFER_BIT, 248 }; 249 250 for (unsigned int i = 0; i < ARRAY_SIZE(buffer_bits); ++i) { 251 if ((mask & buffer_bits[i]) && 252 try_blorp_blit(intel, 253 srcX0, srcY0, srcX1, srcY1, 254 dstX0, dstY0, dstX1, dstY1, 255 filter, buffer_bits[i])) { 256 mask &= ~buffer_bits[i]; 257 } 258 } 259 260 return mask; 261} 262 263 264/** 265 * Enum to specify the order of arguments in a sampler message 266 */ 267enum sampler_message_arg 268{ 269 SAMPLER_MESSAGE_ARG_U_FLOAT, 270 SAMPLER_MESSAGE_ARG_V_FLOAT, 271 SAMPLER_MESSAGE_ARG_U_INT, 272 SAMPLER_MESSAGE_ARG_V_INT, 273 SAMPLER_MESSAGE_ARG_SI_INT, 274 SAMPLER_MESSAGE_ARG_MCS_INT, 275 SAMPLER_MESSAGE_ARG_ZERO_INT, 276}; 277 278/** 279 * Generator for WM programs used in BLORP blits. 280 * 281 * The bulk of the work done by the WM program is to wrap and unwrap the 282 * coordinate transformations used by the hardware to store surfaces in 283 * memory. 
The hardware transforms a pixel location (X, Y, S) (where S is the 284 * sample index for a multisampled surface) to a memory offset by the 285 * following formulas: 286 * 287 * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S)) 288 * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset)) 289 * 290 * For a single-sampled surface, or for a multisampled surface using 291 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity 292 * function: 293 * 294 * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0) 295 * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0) 296 * encode_msaa(n, UMS, X, Y, S) = (X, Y, S) 297 * decode_msaa(n, UMS, X, Y, S) = (X, Y, S) 298 * 299 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa() 300 * embeds the sample number into bit 1 of the X and Y coordinates: 301 * 302 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0) 303 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) 304 * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1) 305 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S) 306 * where X' = (X & ~0b11) >> 1 | (X & 0b1) 307 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) 308 * S = (Y & 0b10) | (X & 0b10) >> 1 309 * 310 * For X tiling, tile() combines together the low-order bits of the X and Y 311 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512 312 * bytes wide and 8 rows high: 313 * 314 * tile(x_tiled, X, Y, S) = A 315 * where A = tile_num << 12 | offset 316 * tile_num = (Y' >> 3) * tile_pitch + (X' >> 9) 317 * offset = (Y' & 0b111) << 9 318 * | (X & 0b111111111) 319 * X' = X * cpp 320 * Y' = Y + S * qpitch 321 * detile(x_tiled, A) = (X, Y, S) 322 * where X = X' / cpp 323 * Y = Y' % qpitch 324 * S = Y' / qpitch 325 * Y' = (tile_num / tile_pitch) << 3 326 * | (A & 0b111000000000) >> 9 327 * X' = (tile_num % tile_pitch) << 9 328 * | (A & 0b111111111) 329 * 330 * (In all tiling formulas, cpp is the number of bytes occupied by a single 331 * sample ("chars per pixel"), 
tile_pitch is the number of 4k tiles required 332 * to fill the width of the surface, and qpitch is the spacing (in rows) 333 * between array slices). 334 * 335 * For Y tiling, tile() combines together the low-order bits of the X and Y 336 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128 337 * bytes wide and 32 rows high: 338 * 339 * tile(y_tiled, X, Y, S) = A 340 * where A = tile_num << 12 | offset 341 * tile_num = (Y' >> 5) * tile_pitch + (X' >> 7) 342 * offset = (X' & 0b1110000) << 5 343 * | (Y' & 0b11111) << 4 344 * | (X' & 0b1111) 345 * X' = X * cpp 346 * Y' = Y + S * qpitch 347 * detile(y_tiled, A) = (X, Y, S) 348 * where X = X' / cpp 349 * Y = Y' % qpitch 350 * S = Y' / qpitch 351 * Y' = (tile_num / tile_pitch) << 5 352 * | (A & 0b111110000) >> 4 353 * X' = (tile_num % tile_pitch) << 7 354 * | (A & 0b111000000000) >> 5 355 * | (A & 0b1111) 356 * 357 * For W tiling, tile() combines together the low-order bits of the X and Y 358 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64 359 * bytes wide and 64 rows high (note that W tiling is only used for stencil 360 * buffers, which always have cpp = 1 and S=0): 361 * 362 * tile(w_tiled, X, Y, S) = A 363 * where A = tile_num << 12 | offset 364 * tile_num = (Y' >> 6) * tile_pitch + (X' >> 6) 365 * offset = (X' & 0b111000) << 6 366 * | (Y' & 0b111100) << 3 367 * | (X' & 0b100) << 2 368 * | (Y' & 0b10) << 2 369 * | (X' & 0b10) << 1 370 * | (Y' & 0b1) << 1 371 * | (X' & 0b1) 372 * X' = X * cpp = X 373 * Y' = Y + S * qpitch 374 * detile(w_tiled, A) = (X, Y, S) 375 * where X = X' / cpp = X' 376 * Y = Y' % qpitch = Y' 377 * S = Y / qpitch = 0 378 * Y' = (tile_num / tile_pitch) << 6 379 * | (A & 0b111100000) >> 3 380 * | (A & 0b1000) >> 2 381 * | (A & 0b10) >> 1 382 * X' = (tile_num % tile_pitch) << 6 383 * | (A & 0b111000000000) >> 6 384 * | (A & 0b10000) >> 2 385 * | (A & 0b100) >> 1 386 * | (A & 0b1) 387 * 388 * Finally, for a non-tiled surface, tile() simply combines 
together the X and 389 * Y coordinates in the natural way: 390 * 391 * tile(untiled, X, Y, S) = A 392 * where A = Y * pitch + X' 393 * X' = X * cpp 394 * Y' = Y + S * qpitch 395 * detile(untiled, A) = (X, Y, S) 396 * where X = X' / cpp 397 * Y = Y' % qpitch 398 * S = Y' / qpitch 399 * X' = A % pitch 400 * Y' = A / pitch 401 * 402 * (In these formulas, pitch is the number of bytes occupied by a single row 403 * of samples). 404 */ 405class brw_blorp_blit_program 406{ 407public: 408 brw_blorp_blit_program(struct brw_context *brw, 409 const brw_blorp_blit_prog_key *key); 410 ~brw_blorp_blit_program(); 411 412 const GLuint *compile(struct brw_context *brw, GLuint *program_size); 413 414 brw_blorp_prog_data prog_data; 415 416private: 417 void alloc_regs(); 418 void alloc_push_const_regs(int base_reg); 419 void compute_frag_coords(); 420 void translate_tiling(bool old_tiled_w, bool new_tiled_w); 421 void encode_msaa(unsigned num_samples, intel_msaa_layout layout); 422 void decode_msaa(unsigned num_samples, intel_msaa_layout layout); 423 void kill_if_outside_dst_rect(); 424 void translate_dst_to_src(); 425 void single_to_blend(); 426 void manual_blend(); 427 void sample(struct brw_reg dst); 428 void texel_fetch(struct brw_reg dst); 429 void mcs_fetch(); 430 void expand_to_32_bits(struct brw_reg src, struct brw_reg dst); 431 void texture_lookup(struct brw_reg dst, GLuint msg_type, 432 const sampler_message_arg *args, int num_args); 433 void render_target_write(); 434 435 /** 436 * Base-2 logarithm of the maximum number of samples that can be blended. 437 */ 438 static const unsigned LOG2_MAX_BLEND_SAMPLES = 2; 439 440 void *mem_ctx; 441 struct brw_context *brw; 442 const brw_blorp_blit_prog_key *key; 443 struct brw_compile func; 444 445 /* Thread dispatch header */ 446 struct brw_reg R0; 447 448 /* Pixel X/Y coordinates (always in R1). 
*/ 449 struct brw_reg R1; 450 451 /* Push constants */ 452 struct brw_reg dst_x0; 453 struct brw_reg dst_x1; 454 struct brw_reg dst_y0; 455 struct brw_reg dst_y1; 456 struct { 457 struct brw_reg multiplier; 458 struct brw_reg offset; 459 } x_transform, y_transform; 460 461 /* Data read from texture (4 vec16's per array element) */ 462 struct brw_reg texture_data[LOG2_MAX_BLEND_SAMPLES + 1]; 463 464 /* Auxiliary storage for the contents of the MCS surface. 465 * 466 * Since the sampler always returns 8 registers worth of data, this is 8 467 * registers wide, even though we only use the first 2 registers of it. 468 */ 469 struct brw_reg mcs_data; 470 471 /* X coordinates. We have two of them so that we can perform coordinate 472 * transformations easily. 473 */ 474 struct brw_reg x_coords[2]; 475 476 /* Y coordinates. We have two of them so that we can perform coordinate 477 * transformations easily. 478 */ 479 struct brw_reg y_coords[2]; 480 481 /* Which element of x_coords and y_coords is currently in use. 482 */ 483 int xy_coord_index; 484 485 /* True if, at the point in the program currently being compiled, the 486 * sample index is known to be zero. 487 */ 488 bool s_is_zero; 489 490 /* Register storing the sample index when s_is_zero is false. 
*/ 491 struct brw_reg sample_index; 492 493 /* Temporaries */ 494 struct brw_reg t1; 495 struct brw_reg t2; 496 497 /* MRF used for sampling and render target writes */ 498 GLuint base_mrf; 499}; 500 501brw_blorp_blit_program::brw_blorp_blit_program( 502 struct brw_context *brw, 503 const brw_blorp_blit_prog_key *key) 504 : mem_ctx(ralloc_context(NULL)), 505 brw(brw), 506 key(key) 507{ 508 brw_init_compile(brw, &func, mem_ctx); 509} 510 511brw_blorp_blit_program::~brw_blorp_blit_program() 512{ 513 ralloc_free(mem_ctx); 514} 515 516const GLuint * 517brw_blorp_blit_program::compile(struct brw_context *brw, 518 GLuint *program_size) 519{ 520 /* Sanity checks */ 521 if (key->dst_tiled_w && key->rt_samples > 0) { 522 /* If the destination image is W tiled and multisampled, then the thread 523 * must be dispatched once per sample, not once per pixel. This is 524 * necessary because after conversion between W and Y tiling, there's no 525 * guarantee that all samples corresponding to a single pixel will still 526 * be together. 527 */ 528 assert(key->persample_msaa_dispatch); 529 } 530 531 if (key->blend) { 532 /* We are blending, which means we won't have an opportunity to 533 * translate the tiling and sample count for the texture surface. So 534 * the surface state for the texture must be configured with the correct 535 * tiling and sample count. 536 */ 537 assert(!key->src_tiled_w); 538 assert(key->tex_samples == key->src_samples); 539 assert(key->tex_layout == key->src_layout); 540 assert(key->tex_samples > 0); 541 } 542 543 if (key->persample_msaa_dispatch) { 544 /* It only makes sense to do persample dispatch if the render target is 545 * configured as multisampled. 
546 */ 547 assert(key->rt_samples > 0); 548 } 549 550 /* Make sure layout is consistent with sample count */ 551 assert((key->tex_layout == INTEL_MSAA_LAYOUT_NONE) == 552 (key->tex_samples == 0)); 553 assert((key->rt_layout == INTEL_MSAA_LAYOUT_NONE) == 554 (key->rt_samples == 0)); 555 assert((key->src_layout == INTEL_MSAA_LAYOUT_NONE) == 556 (key->src_samples == 0)); 557 assert((key->dst_layout == INTEL_MSAA_LAYOUT_NONE) == 558 (key->dst_samples == 0)); 559 560 /* Set up prog_data */ 561 memset(&prog_data, 0, sizeof(prog_data)); 562 prog_data.persample_msaa_dispatch = key->persample_msaa_dispatch; 563 564 brw_set_compression_control(&func, BRW_COMPRESSION_NONE); 565 566 alloc_regs(); 567 compute_frag_coords(); 568 569 /* Render target and texture hardware don't support W tiling. */ 570 const bool rt_tiled_w = false; 571 const bool tex_tiled_w = false; 572 573 /* The address that data will be written to is determined by the 574 * coordinates supplied to the WM thread and the tiling and sample count of 575 * the render target, according to the formula: 576 * 577 * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset)) 578 * 579 * If the actual tiling and sample count of the destination surface are not 580 * the same as the configuration of the render target, then these 581 * coordinates are wrong and we have to adjust them to compensate for the 582 * difference. 583 */ 584 if (rt_tiled_w != key->dst_tiled_w || 585 key->rt_samples != key->dst_samples || 586 key->rt_layout != key->dst_layout) { 587 encode_msaa(key->rt_samples, key->rt_layout); 588 /* Now (X, Y, S) = detile(rt_tiling, offset) */ 589 translate_tiling(rt_tiled_w, key->dst_tiled_w); 590 /* Now (X, Y, S) = detile(dst_tiling, offset) */ 591 decode_msaa(key->dst_samples, key->dst_layout); 592 } 593 594 /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)). 
595 * 596 * That is: X, Y and S now contain the true coordinates and sample index of 597 * the data that the WM thread should output. 598 * 599 * If we need to kill pixels that are outside the destination rectangle, 600 * now is the time to do it. 601 */ 602 603 if (key->use_kill) 604 kill_if_outside_dst_rect(); 605 606 /* Next, apply a translation to obtain coordinates in the source image. */ 607 translate_dst_to_src(); 608 609 /* If the source image is not multisampled, then we want to fetch sample 610 * number 0, because that's the only sample there is. 611 */ 612 if (key->src_samples == 0) 613 s_is_zero = true; 614 615 /* X, Y, and S are now the coordinates of the pixel in the source image 616 * that we want to texture from. Exception: if we are blending, then S is 617 * irrelevant, because we are going to fetch all samples. 618 */ 619 if (key->blend) { 620 if (brw->intel.gen == 6) { 621 /* Gen6 hardware an automatically blend using the SAMPLE message */ 622 single_to_blend(); 623 sample(texture_data[0]); 624 } else { 625 /* Gen7+ hardware doesn't automaticaly blend. */ 626 manual_blend(); 627 } 628 } else { 629 /* We aren't blending, which means we just want to fetch a single sample 630 * from the source surface. The address that we want to fetch from is 631 * related to the X, Y and S values according to the formula: 632 * 633 * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)). 634 * 635 * If the actual tiling and sample count of the source surface are not 636 * the same as the configuration of the texture, then we need to adjust 637 * the coordinates to compensate for the difference. 
638 */ 639 if (tex_tiled_w != key->src_tiled_w || 640 key->tex_samples != key->src_samples || 641 key->tex_layout != key->src_layout) { 642 encode_msaa(key->src_samples, key->src_layout); 643 /* Now (X, Y, S) = detile(src_tiling, offset) */ 644 translate_tiling(key->src_tiled_w, tex_tiled_w); 645 /* Now (X, Y, S) = detile(tex_tiling, offset) */ 646 decode_msaa(key->tex_samples, key->tex_layout); 647 } 648 649 /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)). 650 * 651 * In other words: X, Y, and S now contain values which, when passed to 652 * the texturing unit, will cause data to be read from the correct 653 * memory location. So we can fetch the texel now. 654 */ 655 if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS) 656 mcs_fetch(); 657 texel_fetch(texture_data[0]); 658 } 659 660 /* Finally, write the fetched (or blended) value to the render target and 661 * terminate the thread. 662 */ 663 render_target_write(); 664 return brw_get_program(&func, program_size); 665} 666 667void 668brw_blorp_blit_program::alloc_push_const_regs(int base_reg) 669{ 670#define CONST_LOC(name) offsetof(brw_blorp_wm_push_constants, name) 671#define ALLOC_REG(name) \ 672 this->name = \ 673 brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, base_reg, CONST_LOC(name) / 2) 674 675 ALLOC_REG(dst_x0); 676 ALLOC_REG(dst_x1); 677 ALLOC_REG(dst_y0); 678 ALLOC_REG(dst_y1); 679 ALLOC_REG(x_transform.multiplier); 680 ALLOC_REG(x_transform.offset); 681 ALLOC_REG(y_transform.multiplier); 682 ALLOC_REG(y_transform.offset); 683#undef CONST_LOC 684#undef ALLOC_REG 685} 686 687void 688brw_blorp_blit_program::alloc_regs() 689{ 690 int reg = 0; 691 this->R0 = retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW); 692 this->R1 = retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW); 693 prog_data.first_curbe_grf = reg; 694 alloc_push_const_regs(reg); 695 reg += BRW_BLORP_NUM_PUSH_CONST_REGS; 696 for (unsigned i = 0; i < ARRAY_SIZE(texture_data); ++i) { 697 this->texture_data[i] = 698 
retype(vec16(brw_vec8_grf(reg, 0)), key->texture_data_type); 699 reg += 8; 700 } 701 this->mcs_data = 702 retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD); reg += 8; 703 for (int i = 0; i < 2; ++i) { 704 this->x_coords[i] 705 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW)); 706 this->y_coords[i] 707 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW)); 708 } 709 this->xy_coord_index = 0; 710 this->sample_index 711 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW)); 712 this->t1 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW)); 713 this->t2 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW)); 714 715 /* Make sure we didn't run out of registers */ 716 assert(reg <= GEN7_MRF_HACK_START); 717 718 int mrf = 2; 719 this->base_mrf = mrf; 720} 721 722/* In the code that follows, X and Y can be used to quickly refer to the 723 * active elements of x_coords and y_coords, and Xp and Yp ("X prime" and "Y 724 * prime") to the inactive elements. 725 * 726 * S can be used to quickly refer to sample_index. 727 */ 728#define X x_coords[xy_coord_index] 729#define Y y_coords[xy_coord_index] 730#define Xp x_coords[!xy_coord_index] 731#define Yp y_coords[!xy_coord_index] 732#define S sample_index 733 734/* Quickly swap the roles of (X, Y) and (Xp, Yp). Saves us from having to do 735 * MOVs to transfor (Xp, Yp) to (X, Y) after a coordinate transformation. 736 */ 737#define SWAP_XY_AND_XPYP() xy_coord_index = !xy_coord_index; 738 739/** 740 * Emit code to compute the X and Y coordinates of the pixels being rendered 741 * by this WM invocation. 742 * 743 * Assuming the render target is set up for Y tiling, these (X, Y) values are 744 * related to the address offset where outputs will be written by the formula: 745 * 746 * (X, Y, S) = decode_msaa(detile(offset)). 747 * 748 * (See brw_blorp_blit_program). 
749 */ 750void 751brw_blorp_blit_program::compute_frag_coords() 752{ 753 /* R1.2[15:0] = X coordinate of upper left pixel of subspan 0 (pixel 0) 754 * R1.3[15:0] = X coordinate of upper left pixel of subspan 1 (pixel 4) 755 * R1.4[15:0] = X coordinate of upper left pixel of subspan 2 (pixel 8) 756 * R1.5[15:0] = X coordinate of upper left pixel of subspan 3 (pixel 12) 757 * 758 * Pixels within a subspan are laid out in this arrangement: 759 * 0 1 760 * 2 3 761 * 762 * So, to compute the coordinates of each pixel, we need to read every 2nd 763 * 16-bit value (vstride=2) from R1, starting at the 4th 16-bit value 764 * (suboffset=4), and duplicate each value 4 times (hstride=0, width=4). 765 * In other words, the data we want to access is R1.4<2;4,0>UW. 766 * 767 * Then, we need to add the repeating sequence (0, 1, 0, 1, ...) to the 768 * result, since pixels n+1 and n+3 are in the right half of the subspan. 769 */ 770 brw_ADD(&func, X, stride(suboffset(R1, 4), 2, 4, 0), brw_imm_v(0x10101010)); 771 772 /* Similarly, Y coordinates for subspans come from R1.2[31:16] through 773 * R1.5[31:16], so to get pixel Y coordinates we need to start at the 5th 774 * 16-bit value instead of the 4th (R1.5<2;4,0>UW instead of 775 * R1.4<2;4,0>UW). 776 * 777 * And we need to add the repeating sequence (0, 0, 1, 1, ...), since 778 * pixels n+2 and n+3 are in the bottom half of the subspan. 779 */ 780 brw_ADD(&func, Y, stride(suboffset(R1, 5), 2, 4, 0), brw_imm_v(0x11001100)); 781 782 if (key->persample_msaa_dispatch) { 783 /* The WM will be run in MSDISPMODE_PERSAMPLE with num_samples > 0. 784 * Therefore, subspan 0 will represent sample 0, subspan 1 will 785 * represent sample 1, and so on. 786 * 787 * So we need to populate S with the sequence (0, 0, 0, 0, 1, 1, 1, 1, 788 * 2, 2, 2, 2, 3, 3, 3, 3). The easiest way to do this is to populate a 789 * temporary variable with the sequence (0, 1, 2, 3), and then copy from 790 * it using vstride=1, width=4, hstride=0. 
791 * 792 * TODO: implement the necessary calculation for 8x multisampling. 793 */ 794 brw_MOV(&func, t1, brw_imm_v(0x3210)); 795 brw_MOV(&func, S, stride(t1, 1, 4, 0)); 796 s_is_zero = false; 797 } else { 798 /* Either the destination surface is single-sampled, or the WM will be 799 * run in MSDISPMODE_PERPIXEL (which causes a single fragment dispatch 800 * per pixel). In either case, it's not meaningful to compute a sample 801 * value. Just set it to 0. 802 */ 803 s_is_zero = true; 804 } 805} 806 807/** 808 * Emit code to compensate for the difference between Y and W tiling. 809 * 810 * This code modifies the X and Y coordinates according to the formula: 811 * 812 * (X', Y', S') = detile(new_tiling, tile(old_tiling, X, Y, S)) 813 * 814 * (See brw_blorp_blit_program). 815 * 816 * It can only translate between W and Y tiling, so new_tiling and old_tiling 817 * are booleans where true represents W tiling and false represents Y tiling. 818 */ 819void 820brw_blorp_blit_program::translate_tiling(bool old_tiled_w, bool new_tiled_w) 821{ 822 if (old_tiled_w == new_tiled_w) 823 return; 824 825 /* In the code that follows, we can safely assume that S = 0, because W 826 * tiling formats always use IMS layout. 827 */ 828 assert(s_is_zero); 829 830 if (new_tiled_w) { 831 /* Given X and Y coordinates that describe an address using Y tiling, 832 * translate to the X and Y coordinates that describe the same address 833 * using W tiling. 
834 * 835 * If we break down the low order bits of X and Y, using a 836 * single letter to represent each low-order bit: 837 * 838 * X = A << 7 | 0bBCDEFGH 839 * Y = J << 5 | 0bKLMNP (1) 840 * 841 * Then we can apply the Y tiling formula to see the memory offset being 842 * addressed: 843 * 844 * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2) 845 * 846 * If we apply the W detiling formula to this memory location, that the 847 * corresponding X' and Y' coordinates are: 848 * 849 * X' = A << 6 | 0bBCDPFH (3) 850 * Y' = J << 6 | 0bKLMNEG 851 * 852 * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'), 853 * we need to make the following computation: 854 * 855 * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4) 856 * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1 857 */ 858 brw_AND(&func, t1, X, brw_imm_uw(0xfff4)); /* X & ~0b1011 */ 859 brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b1011) >> 1 */ 860 brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */ 861 brw_SHL(&func, t2, t2, brw_imm_uw(2)); /* (Y & 0b1) << 2 */ 862 brw_OR(&func, t1, t1, t2); /* (X & ~0b1011) >> 1 | (Y & 0b1) << 2 */ 863 brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */ 864 brw_OR(&func, Xp, t1, t2); 865 brw_AND(&func, t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */ 866 brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */ 867 brw_AND(&func, t2, X, brw_imm_uw(8)); /* X & 0b1000 */ 868 brw_SHR(&func, t2, t2, brw_imm_uw(2)); /* (X & 0b1000) >> 2 */ 869 brw_OR(&func, t1, t1, t2); /* (Y & ~0b1) << 1 | (X & 0b1000) >> 2 */ 870 brw_AND(&func, t2, X, brw_imm_uw(2)); /* X & 0b10 */ 871 brw_SHR(&func, t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */ 872 brw_OR(&func, Yp, t1, t2); 873 SWAP_XY_AND_XPYP(); 874 } else { 875 /* Applying the same logic as above, but in reverse, we obtain the 876 * formulas: 877 * 878 * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1 879 * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2 880 */ 881 brw_AND(&func, t1, X, 
brw_imm_uw(0xfffa)); /* X & ~0b101 */ 882 brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b101) << 1 */ 883 brw_AND(&func, t2, Y, brw_imm_uw(2)); /* Y & 0b10 */ 884 brw_SHL(&func, t2, t2, brw_imm_uw(2)); /* (Y & 0b10) << 2 */ 885 brw_OR(&func, t1, t1, t2); /* (X & ~0b101) << 1 | (Y & 0b10) << 2 */ 886 brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */ 887 brw_SHL(&func, t2, t2, brw_imm_uw(1)); /* (Y & 0b1) << 1 */ 888 brw_OR(&func, t1, t1, t2); /* (X & ~0b101) << 1 | (Y & 0b10) << 2 889 | (Y & 0b1) << 1 */ 890 brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */ 891 brw_OR(&func, Xp, t1, t2); 892 brw_AND(&func, t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */ 893 brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */ 894 brw_AND(&func, t2, X, brw_imm_uw(4)); /* X & 0b100 */ 895 brw_SHR(&func, t2, t2, brw_imm_uw(2)); /* (X & 0b100) >> 2 */ 896 brw_OR(&func, Yp, t1, t2); 897 SWAP_XY_AND_XPYP(); 898 } 899} 900 901/** 902 * Emit code to compensate for the difference between MSAA and non-MSAA 903 * surfaces. 904 * 905 * This code modifies the X and Y coordinates according to the formula: 906 * 907 * (X', Y', S') = encode_msaa_4x(X, Y, S) 908 * 909 * (See brw_blorp_blit_program). 910 */ 911void 912brw_blorp_blit_program::encode_msaa(unsigned num_samples, 913 intel_msaa_layout layout) 914{ 915 switch (layout) { 916 case INTEL_MSAA_LAYOUT_NONE: 917 /* No translation necessary, and S should already be zero. */ 918 assert(s_is_zero); 919 break; 920 case INTEL_MSAA_LAYOUT_CMS: 921 /* We can't compensate for compressed layout since at this point in the 922 * program we haven't read from the MCS buffer. 923 */ 924 assert(!"Bad layout in encode_msaa"); 925 break; 926 case INTEL_MSAA_LAYOUT_UMS: 927 /* No translation necessary. 
*/ 928 break; 929 case INTEL_MSAA_LAYOUT_IMS: 930 /* encode_msaa(4, IMS, X, Y, S) = (X', Y', 0) 931 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) 932 * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1) 933 */ 934 brw_AND(&func, t1, X, brw_imm_uw(0xfffe)); /* X & ~0b1 */ 935 if (!s_is_zero) { 936 brw_AND(&func, t2, S, brw_imm_uw(1)); /* S & 0b1 */ 937 brw_OR(&func, t1, t1, t2); /* (X & ~0b1) | (S & 0b1) */ 938 } 939 brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b1) << 1 940 | (S & 0b1) << 1 */ 941 brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */ 942 brw_OR(&func, Xp, t1, t2); 943 brw_AND(&func, t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */ 944 brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */ 945 if (!s_is_zero) { 946 brw_AND(&func, t2, S, brw_imm_uw(2)); /* S & 0b10 */ 947 brw_OR(&func, t1, t1, t2); /* (Y & ~0b1) << 1 | (S & 0b10) */ 948 } 949 brw_AND(&func, t2, Y, brw_imm_uw(1)); 950 brw_OR(&func, Yp, t1, t2); 951 SWAP_XY_AND_XPYP(); 952 s_is_zero = true; 953 break; 954 } 955} 956 957/** 958 * Emit code to compensate for the difference between MSAA and non-MSAA 959 * surfaces. 960 * 961 * This code modifies the X and Y coordinates according to the formula: 962 * 963 * (X', Y', S) = decode_msaa(num_samples, X, Y, S) 964 * 965 * (See brw_blorp_blit_program). 966 */ 967void 968brw_blorp_blit_program::decode_msaa(unsigned num_samples, 969 intel_msaa_layout layout) 970{ 971 switch (layout) { 972 case INTEL_MSAA_LAYOUT_NONE: 973 /* No translation necessary, and S should already be zero. */ 974 assert(s_is_zero); 975 break; 976 case INTEL_MSAA_LAYOUT_CMS: 977 /* We can't compensate for compressed layout since at this point in the 978 * program we don't have access to the MCS buffer. 979 */ 980 assert(!"Bad layout in encode_msaa"); 981 break; 982 case INTEL_MSAA_LAYOUT_UMS: 983 /* No translation necessary. 
*/ 984 break; 985 case INTEL_MSAA_LAYOUT_IMS: 986 /* decode_msaa(4, IMS, X, Y, 0) = (X', Y', S) 987 * where X' = (X & ~0b11) >> 1 | (X & 0b1) 988 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) 989 * S = (Y & 0b10) | (X & 0b10) >> 1 990 */ 991 assert(s_is_zero); 992 brw_AND(&func, t1, X, brw_imm_uw(0xfffc)); /* X & ~0b11 */ 993 brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b11) >> 1 */ 994 brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */ 995 brw_OR(&func, Xp, t1, t2); 996 brw_AND(&func, t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */ 997 brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */ 998 brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */ 999 brw_OR(&func, Yp, t1, t2); 1000 brw_AND(&func, t1, Y, brw_imm_uw(2)); /* Y & 0b10 */ 1001 brw_AND(&func, t2, X, brw_imm_uw(2)); /* X & 0b10 */ 1002 brw_SHR(&func, t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */ 1003 brw_OR(&func, S, t1, t2); 1004 s_is_zero = false; 1005 SWAP_XY_AND_XPYP(); 1006 break; 1007 } 1008} 1009 1010/** 1011 * Emit code that kills pixels whose X and Y coordinates are outside the 1012 * boundary of the rectangle defined by the push constants (dst_x0, dst_y0, 1013 * dst_x1, dst_y1). 
1014 */ 1015void 1016brw_blorp_blit_program::kill_if_outside_dst_rect() 1017{ 1018 struct brw_reg f0 = brw_flag_reg(); 1019 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 1020 struct brw_reg null16 = vec16(retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); 1021 1022 brw_CMP(&func, null16, BRW_CONDITIONAL_GE, X, dst_x0); 1023 brw_CMP(&func, null16, BRW_CONDITIONAL_GE, Y, dst_y0); 1024 brw_CMP(&func, null16, BRW_CONDITIONAL_L, X, dst_x1); 1025 brw_CMP(&func, null16, BRW_CONDITIONAL_L, Y, dst_y1); 1026 1027 brw_set_predicate_control(&func, BRW_PREDICATE_NONE); 1028 brw_push_insn_state(&func); 1029 brw_set_mask_control(&func, BRW_MASK_DISABLE); 1030 brw_AND(&func, g1, f0, g1); 1031 brw_pop_insn_state(&func); 1032} 1033 1034/** 1035 * Emit code to translate from destination (X, Y) coordinates to source (X, Y) 1036 * coordinates. 1037 */ 1038void 1039brw_blorp_blit_program::translate_dst_to_src() 1040{ 1041 brw_MUL(&func, Xp, X, x_transform.multiplier); 1042 brw_MUL(&func, Yp, Y, y_transform.multiplier); 1043 brw_ADD(&func, Xp, Xp, x_transform.offset); 1044 brw_ADD(&func, Yp, Yp, y_transform.offset); 1045 SWAP_XY_AND_XPYP(); 1046} 1047 1048/** 1049 * Emit code to transform the X and Y coordinates as needed for blending 1050 * together the different samples in an MSAA texture. 1051 */ 1052void 1053brw_blorp_blit_program::single_to_blend() 1054{ 1055 /* When looking up samples in an MSAA texture using the SAMPLE message, 1056 * Gen6 requires the texture coordinates to be odd integers (so that they 1057 * correspond to the center of a 2x2 block representing the four samples 1058 * that maxe up a pixel). So we need to multiply our X and Y coordinates 1059 * each by 2 and then add 1. 
1060 */ 1061 brw_SHL(&func, t1, X, brw_imm_w(1)); 1062 brw_SHL(&func, t2, Y, brw_imm_w(1)); 1063 brw_ADD(&func, Xp, t1, brw_imm_w(1)); 1064 brw_ADD(&func, Yp, t2, brw_imm_w(1)); 1065 SWAP_XY_AND_XPYP(); 1066} 1067 1068 1069/** 1070 * Count the number of trailing 1 bits in the given value. For example: 1071 * 1072 * count_trailing_one_bits(0) == 0 1073 * count_trailing_one_bits(7) == 3 1074 * count_trailing_one_bits(11) == 2 1075 */ 1076inline int count_trailing_one_bits(unsigned value) 1077{ 1078#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) /* gcc 3.4 or later */ 1079 return __builtin_ctz(~value); 1080#else 1081 return _mesa_bitcount(value & ~(value + 1)); 1082#endif 1083} 1084 1085 1086void 1087brw_blorp_blit_program::manual_blend() 1088{ 1089 /* TODO: support num_samples != 4 */ 1090 const int num_samples = 4; 1091 1092 if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS) 1093 mcs_fetch(); 1094 1095 /* We add together samples using a binary tree structure, e.g. for 4x MSAA: 1096 * 1097 * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4 1098 * 1099 * This ensures that when all samples have the same value, no numerical 1100 * precision is lost, since each addition operation always adds two equal 1101 * values, and summing two equal floating point values does not lose 1102 * precision. 1103 * 1104 * We perform this computation by treating the texture_data array as a 1105 * stack and performing the following operations: 1106 * 1107 * - push sample 0 onto stack 1108 * - push sample 1 onto stack 1109 * - add top two stack entries 1110 * - push sample 2 onto stack 1111 * - push sample 3 onto stack 1112 * - add top two stack entries 1113 * - add top two stack entries 1114 * - divide top stack entry by 4 1115 * 1116 * Note that after pushing sample i onto the stack, the number of add 1117 * operations we do is equal to the number of trailing 1 bits in i. 
This 1118 * works provided the total number of samples is a power of two, which it 1119 * always is for i965. 1120 * 1121 * For integer formats, we replace the add operations with average 1122 * operations and skip the final division. 1123 */ 1124 typedef struct brw_instruction *(*brw_op2_ptr)(struct brw_compile *, 1125 struct brw_reg, 1126 struct brw_reg, 1127 struct brw_reg); 1128 brw_op2_ptr combine_op = 1129 key->texture_data_type == BRW_REGISTER_TYPE_F ? brw_ADD : brw_AVG; 1130 unsigned stack_depth = 0; 1131 for (int i = 0; i < num_samples; ++i) { 1132 assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */ 1133 1134 /* Push sample i onto the stack */ 1135 assert(stack_depth < ARRAY_SIZE(texture_data)); 1136 if (i == 0) { 1137 s_is_zero = true; 1138 } else { 1139 s_is_zero = false; 1140 brw_MOV(&func, S, brw_imm_uw(i)); 1141 } 1142 texel_fetch(texture_data[stack_depth++]); 1143 1144 if (i == 0 && key->tex_layout == INTEL_MSAA_LAYOUT_CMS) { 1145 /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface) 1146 * suggests an optimization: 1147 * 1148 * "A simple optimization with probable large return in 1149 * performance is to compare the MCS value to zero (indicating 1150 * all samples are on sample slice 0), and sample only from 1151 * sample slice 0 using ld2dss if MCS is zero." 1152 * 1153 * Note that in the case where the MCS value is zero, sampling from 1154 * sample slice 0 using ld2dss and sampling from sample 0 using 1155 * ld2dms are equivalent (since all samples are on sample slice 0). 1156 * Since we have already sampled from sample 0, all we need to do is 1157 * skip the remaining fetches and averaging if MCS is zero. 
1158 */ 1159 brw_CMP(&func, vec16(brw_null_reg()), BRW_CONDITIONAL_NZ, 1160 mcs_data, brw_imm_ud(0)); 1161 brw_IF(&func, BRW_EXECUTE_16); 1162 } 1163 1164 /* Do count_trailing_one_bits(i) times */ 1165 for (int j = count_trailing_one_bits(i); j-- > 0; ) { 1166 assert(stack_depth >= 2); 1167 --stack_depth; 1168 1169 /* TODO: should use a smaller loop bound for non_RGBA formats */ 1170 for (int k = 0; k < 4; ++k) { 1171 combine_op(&func, offset(texture_data[stack_depth - 1], 2*k), 1172 offset(vec8(texture_data[stack_depth - 1]), 2*k), 1173 offset(vec8(texture_data[stack_depth]), 2*k)); 1174 } 1175 } 1176 } 1177 1178 /* We should have just 1 sample on the stack now. */ 1179 assert(stack_depth == 1); 1180 1181 if (key->texture_data_type == BRW_REGISTER_TYPE_F) { 1182 /* Scale the result down by a factor of num_samples */ 1183 /* TODO: should use a smaller loop bound for non-RGBA formats */ 1184 for (int j = 0; j < 4; ++j) { 1185 brw_MUL(&func, offset(texture_data[0], 2*j), 1186 offset(vec8(texture_data[0]), 2*j), 1187 brw_imm_f(1.0/num_samples)); 1188 } 1189 } 1190 1191 if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS) 1192 brw_ENDIF(&func); 1193} 1194 1195/** 1196 * Emit code to look up a value in the texture using the SAMPLE message (which 1197 * does blending of MSAA surfaces). 1198 */ 1199void 1200brw_blorp_blit_program::sample(struct brw_reg dst) 1201{ 1202 static const sampler_message_arg args[2] = { 1203 SAMPLER_MESSAGE_ARG_U_FLOAT, 1204 SAMPLER_MESSAGE_ARG_V_FLOAT 1205 }; 1206 1207 texture_lookup(dst, GEN5_SAMPLER_MESSAGE_SAMPLE, args, ARRAY_SIZE(args)); 1208} 1209 1210/** 1211 * Emit code to look up a value in the texture using the SAMPLE_LD message 1212 * (which does a simple texel fetch). 
1213 */ 1214void 1215brw_blorp_blit_program::texel_fetch(struct brw_reg dst) 1216{ 1217 static const sampler_message_arg gen6_args[5] = { 1218 SAMPLER_MESSAGE_ARG_U_INT, 1219 SAMPLER_MESSAGE_ARG_V_INT, 1220 SAMPLER_MESSAGE_ARG_ZERO_INT, /* R */ 1221 SAMPLER_MESSAGE_ARG_ZERO_INT, /* LOD */ 1222 SAMPLER_MESSAGE_ARG_SI_INT 1223 }; 1224 static const sampler_message_arg gen7_ld_args[3] = { 1225 SAMPLER_MESSAGE_ARG_U_INT, 1226 SAMPLER_MESSAGE_ARG_ZERO_INT, /* LOD */ 1227 SAMPLER_MESSAGE_ARG_V_INT 1228 }; 1229 static const sampler_message_arg gen7_ld2dss_args[3] = { 1230 SAMPLER_MESSAGE_ARG_SI_INT, 1231 SAMPLER_MESSAGE_ARG_U_INT, 1232 SAMPLER_MESSAGE_ARG_V_INT 1233 }; 1234 static const sampler_message_arg gen7_ld2dms_args[4] = { 1235 SAMPLER_MESSAGE_ARG_SI_INT, 1236 SAMPLER_MESSAGE_ARG_MCS_INT, 1237 SAMPLER_MESSAGE_ARG_U_INT, 1238 SAMPLER_MESSAGE_ARG_V_INT 1239 }; 1240 1241 switch (brw->intel.gen) { 1242 case 6: 1243 texture_lookup(dst, GEN5_SAMPLER_MESSAGE_SAMPLE_LD, gen6_args, 1244 s_is_zero ? 2 : 5); 1245 break; 1246 case 7: 1247 switch (key->tex_layout) { 1248 case INTEL_MSAA_LAYOUT_IMS: 1249 /* From the Ivy Bridge PRM, Vol4 Part1 p72 (Multisampled Surface Storage 1250 * Format): 1251 * 1252 * If this field is MSFMT_DEPTH_STENCIL 1253 * [a.k.a. INTEL_MSAA_LAYOUT_IMS], the only sampling engine 1254 * messages allowed are "ld2dms", "resinfo", and "sampleinfo". 1255 * 1256 * So fall through to emit the same message as we use for 1257 * INTEL_MSAA_LAYOUT_CMS. 
1258 */ 1259 case INTEL_MSAA_LAYOUT_CMS: 1260 texture_lookup(dst, GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS, 1261 gen7_ld2dms_args, ARRAY_SIZE(gen7_ld2dms_args)); 1262 break; 1263 case INTEL_MSAA_LAYOUT_UMS: 1264 texture_lookup(dst, GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS, 1265 gen7_ld2dss_args, ARRAY_SIZE(gen7_ld2dss_args)); 1266 break; 1267 case INTEL_MSAA_LAYOUT_NONE: 1268 assert(s_is_zero); 1269 texture_lookup(dst, GEN5_SAMPLER_MESSAGE_SAMPLE_LD, gen7_ld_args, 1270 ARRAY_SIZE(gen7_ld_args)); 1271 break; 1272 } 1273 break; 1274 default: 1275 assert(!"Should not get here."); 1276 break; 1277 }; 1278} 1279 1280void 1281brw_blorp_blit_program::mcs_fetch() 1282{ 1283 static const sampler_message_arg gen7_ld_mcs_args[2] = { 1284 SAMPLER_MESSAGE_ARG_U_INT, 1285 SAMPLER_MESSAGE_ARG_V_INT 1286 }; 1287 texture_lookup(vec16(mcs_data), GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS, 1288 gen7_ld_mcs_args, ARRAY_SIZE(gen7_ld_mcs_args)); 1289} 1290 1291void 1292brw_blorp_blit_program::expand_to_32_bits(struct brw_reg src, 1293 struct brw_reg dst) 1294{ 1295 brw_MOV(&func, vec8(dst), vec8(src)); 1296 brw_set_compression_control(&func, BRW_COMPRESSION_2NDHALF); 1297 brw_MOV(&func, offset(vec8(dst), 1), suboffset(vec8(src), 8)); 1298 brw_set_compression_control(&func, BRW_COMPRESSION_NONE); 1299} 1300 1301void 1302brw_blorp_blit_program::texture_lookup(struct brw_reg dst, 1303 GLuint msg_type, 1304 const sampler_message_arg *args, 1305 int num_args) 1306{ 1307 struct brw_reg mrf = 1308 retype(vec16(brw_message_reg(base_mrf)), BRW_REGISTER_TYPE_UD); 1309 for (int arg = 0; arg < num_args; ++arg) { 1310 switch (args[arg]) { 1311 case SAMPLER_MESSAGE_ARG_U_FLOAT: 1312 expand_to_32_bits(X, retype(mrf, BRW_REGISTER_TYPE_F)); 1313 break; 1314 case SAMPLER_MESSAGE_ARG_V_FLOAT: 1315 expand_to_32_bits(Y, retype(mrf, BRW_REGISTER_TYPE_F)); 1316 break; 1317 case SAMPLER_MESSAGE_ARG_U_INT: 1318 expand_to_32_bits(X, mrf); 1319 break; 1320 case SAMPLER_MESSAGE_ARG_V_INT: 1321 expand_to_32_bits(Y, mrf); 1322 break; 
1323 case SAMPLER_MESSAGE_ARG_SI_INT: 1324 /* Note: on Gen7, this code may be reached with s_is_zero==true 1325 * because in Gen7's ld2dss message, the sample index is the first 1326 * argument. When this happens, we need to move a 0 into the 1327 * appropriate message register. 1328 */ 1329 if (s_is_zero) 1330 brw_MOV(&func, mrf, brw_imm_ud(0)); 1331 else 1332 expand_to_32_bits(S, mrf); 1333 break; 1334 case SAMPLER_MESSAGE_ARG_MCS_INT: 1335 switch (key->tex_layout) { 1336 case INTEL_MSAA_LAYOUT_CMS: 1337 brw_MOV(&func, mrf, mcs_data); 1338 break; 1339 case INTEL_MSAA_LAYOUT_IMS: 1340 /* When sampling from an IMS surface, MCS data is not relevant, 1341 * and the hardware ignores it. So don't bother populating it. 1342 */ 1343 break; 1344 default: 1345 /* We shouldn't be trying to send MCS data with any other 1346 * layouts. 1347 */ 1348 assert (!"Unsupported layout for MCS data"); 1349 break; 1350 } 1351 break; 1352 case SAMPLER_MESSAGE_ARG_ZERO_INT: 1353 brw_MOV(&func, mrf, brw_imm_ud(0)); 1354 break; 1355 } 1356 mrf.nr += 2; 1357 } 1358 1359 brw_SAMPLE(&func, 1360 retype(dst, BRW_REGISTER_TYPE_UW) /* dest */, 1361 base_mrf /* msg_reg_nr */, 1362 brw_message_reg(base_mrf) /* src0 */, 1363 BRW_BLORP_TEXTURE_BINDING_TABLE_INDEX, 1364 0 /* sampler */, 1365 WRITEMASK_XYZW, 1366 msg_type, 1367 8 /* response_length. TODO: should be smaller for non-RGBA formats? */, 1368 mrf.nr - base_mrf /* msg_length */, 1369 0 /* header_present */, 1370 BRW_SAMPLER_SIMD_MODE_SIMD16, 1371 BRW_SAMPLER_RETURN_FORMAT_FLOAT32); 1372} 1373 1374#undef X 1375#undef Y 1376#undef U 1377#undef V 1378#undef S 1379#undef SWAP_XY_AND_XPYP 1380 1381void 1382brw_blorp_blit_program::render_target_write() 1383{ 1384 struct brw_reg mrf_rt_write = 1385 retype(vec16(brw_message_reg(base_mrf)), key->texture_data_type); 1386 int mrf_offset = 0; 1387 1388 /* If we may have killed pixels, then we need to send R0 and R1 in a header 1389 * so that the render target knows which pixels we killed. 
1390 */ 1391 bool use_header = key->use_kill; 1392 if (use_header) { 1393 /* Copy R0/1 to MRF */ 1394 brw_MOV(&func, retype(mrf_rt_write, BRW_REGISTER_TYPE_UD), 1395 retype(R0, BRW_REGISTER_TYPE_UD)); 1396 mrf_offset += 2; 1397 } 1398 1399 /* Copy texture data to MRFs */ 1400 for (int i = 0; i < 4; ++i) { 1401 /* E.g. mov(16) m2.0<1>:f r2.0<8;8,1>:f { Align1, H1 } */ 1402 brw_MOV(&func, offset(mrf_rt_write, mrf_offset), 1403 offset(vec8(texture_data[0]), 2*i)); 1404 mrf_offset += 2; 1405 } 1406 1407 /* Now write to the render target and terminate the thread */ 1408 brw_fb_WRITE(&func, 1409 16 /* dispatch_width */, 1410 base_mrf /* msg_reg_nr */, 1411 mrf_rt_write /* src0 */, 1412 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, 1413 BRW_BLORP_RENDERBUFFER_BINDING_TABLE_INDEX, 1414 mrf_offset /* msg_length. TODO: Should be smaller for non-RGBA formats. */, 1415 0 /* response_length */, 1416 true /* eot */, 1417 use_header); 1418} 1419 1420 1421void 1422brw_blorp_coord_transform_params::setup(GLuint src0, GLuint dst0, GLuint dst1, 1423 bool mirror) 1424{ 1425 if (!mirror) { 1426 /* When not mirroring a coordinate (say, X), we need: 1427 * x' - src_x0 = x - dst_x0 1428 * Therefore: 1429 * x' = 1*x + (src_x0 - dst_x0) 1430 */ 1431 multiplier = 1; 1432 offset = src0 - dst0; 1433 } else { 1434 /* When mirroring X we need: 1435 * x' - src_x0 = dst_x1 - x - 1 1436 * Therefore: 1437 * x' = -1*x + (src_x0 + dst_x1 - 1) 1438 */ 1439 multiplier = -1; 1440 offset = src0 + dst1 - 1; 1441 } 1442} 1443 1444 1445/** 1446 * Determine which MSAA layout the GPU pipeline should be configured for, 1447 * based on the chip generation, the number of samples, and the true layout of 1448 * the image in memory. 
1449 */ 1450inline intel_msaa_layout 1451compute_msaa_layout_for_pipeline(struct brw_context *brw, unsigned num_samples, 1452 intel_msaa_layout true_layout) 1453{ 1454 if (num_samples == 0) { 1455 /* When configuring the GPU for non-MSAA, we can still accommodate IMS 1456 * format buffers, by transforming coordinates appropriately. 1457 */ 1458 assert(true_layout == INTEL_MSAA_LAYOUT_NONE || 1459 true_layout == INTEL_MSAA_LAYOUT_IMS); 1460 return INTEL_MSAA_LAYOUT_NONE; 1461 } else { 1462 assert(true_layout != INTEL_MSAA_LAYOUT_NONE); 1463 } 1464 1465 /* Prior to Gen7, all MSAA surfaces use IMS layout. */ 1466 if (brw->intel.gen == 6) { 1467 assert(true_layout == INTEL_MSAA_LAYOUT_IMS); 1468 } 1469 1470 return true_layout; 1471} 1472 1473 1474brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw, 1475 struct intel_mipmap_tree *src_mt, 1476 struct intel_mipmap_tree *dst_mt, 1477 GLuint src_x0, GLuint src_y0, 1478 GLuint dst_x0, GLuint dst_y0, 1479 GLuint dst_x1, GLuint dst_y1, 1480 bool mirror_x, bool mirror_y) 1481{ 1482 src.set(brw, src_mt, 0, 0); 1483 dst.set(brw, dst_mt, 0, 0); 1484 1485 use_wm_prog = true; 1486 memset(&wm_prog_key, 0, sizeof(wm_prog_key)); 1487 1488 /* texture_data_type indicates the register type that should be used to 1489 * manipulate texture data. 
1490 */ 1491 switch (_mesa_get_format_datatype(src_mt->format)) { 1492 case GL_UNSIGNED_NORMALIZED: 1493 case GL_SIGNED_NORMALIZED: 1494 case GL_FLOAT: 1495 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F; 1496 break; 1497 case GL_UNSIGNED_INT: 1498 if (src_mt->format == MESA_FORMAT_S8) { 1499 /* We process stencil as though it's an unsigned normalized color */ 1500 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F; 1501 } else { 1502 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_UD; 1503 } 1504 break; 1505 case GL_INT: 1506 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_D; 1507 break; 1508 default: 1509 assert(!"Unrecognized blorp format"); 1510 break; 1511 } 1512 1513 if (brw->intel.gen > 6) { 1514 /* Gen7's rendering hardware only supports the IMS layout for depth and 1515 * stencil render targets. Blorp always maps its destination surface as 1516 * a color render target (even if it's actually a depth or stencil 1517 * buffer). So if the destination is IMS, we'll have to map it as a 1518 * single-sampled texture and interleave the samples ourselves. 1519 */ 1520 if (dst_mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS) 1521 dst.num_samples = 0; 1522 } 1523 1524 if (dst.map_stencil_as_y_tiled && dst.num_samples > 0) { 1525 /* If the destination surface is a W-tiled multisampled stencil buffer 1526 * that we're mapping as Y tiled, then we need to arrange for the WM 1527 * program to run once per sample rather than once per pixel, because 1528 * the memory layout of related samples doesn't match between W and Y 1529 * tiling. 1530 */ 1531 wm_prog_key.persample_msaa_dispatch = true; 1532 } 1533 1534 if (src.num_samples > 0 && dst.num_samples > 0) { 1535 /* We are blitting from a multisample buffer to a multisample buffer, so 1536 * we must preserve samples within a pixel. This means we have to 1537 * arrange for the WM program to run once per sample rather than once 1538 * per pixel. 
1539 */ 1540 wm_prog_key.persample_msaa_dispatch = true; 1541 } 1542 1543 /* The render path must be configured to use the same number of samples as 1544 * the destination buffer. 1545 */ 1546 num_samples = dst.num_samples; 1547 1548 GLenum base_format = _mesa_get_format_base_format(src_mt->format); 1549 if (base_format != GL_DEPTH_COMPONENT && /* TODO: what about depth/stencil? */ 1550 base_format != GL_STENCIL_INDEX && 1551 src_mt->num_samples > 0 && dst_mt->num_samples == 0) { 1552 /* We are downsampling a color buffer, so blend. */ 1553 wm_prog_key.blend = true; 1554 } 1555 1556 /* src_samples and dst_samples are the true sample counts */ 1557 wm_prog_key.src_samples = src_mt->num_samples; 1558 wm_prog_key.dst_samples = dst_mt->num_samples; 1559 1560 /* tex_samples and rt_samples are the sample counts that are set up in 1561 * SURFACE_STATE. 1562 */ 1563 wm_prog_key.tex_samples = src.num_samples; 1564 wm_prog_key.rt_samples = dst.num_samples; 1565 1566 /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will 1567 * use to access the source and destination surfaces. 1568 */ 1569 wm_prog_key.tex_layout = 1570 compute_msaa_layout_for_pipeline(brw, src.num_samples, src.msaa_layout); 1571 wm_prog_key.rt_layout = 1572 compute_msaa_layout_for_pipeline(brw, dst.num_samples, dst.msaa_layout); 1573 1574 /* src_layout and dst_layout indicate the true MSAA layout used by src and 1575 * dst. 
1576 */ 1577 wm_prog_key.src_layout = src_mt->msaa_layout; 1578 wm_prog_key.dst_layout = dst_mt->msaa_layout; 1579 1580 wm_prog_key.src_tiled_w = src.map_stencil_as_y_tiled; 1581 wm_prog_key.dst_tiled_w = dst.map_stencil_as_y_tiled; 1582 x0 = wm_push_consts.dst_x0 = dst_x0; 1583 y0 = wm_push_consts.dst_y0 = dst_y0; 1584 x1 = wm_push_consts.dst_x1 = dst_x1; 1585 y1 = wm_push_consts.dst_y1 = dst_y1; 1586 wm_push_consts.x_transform.setup(src_x0, dst_x0, dst_x1, mirror_x); 1587 wm_push_consts.y_transform.setup(src_y0, dst_y0, dst_y1, mirror_y); 1588 1589 if (dst.num_samples == 0 && dst_mt->num_samples > 0) { 1590 /* We must expand the rectangle we send through the rendering pipeline, 1591 * to account for the fact that we are mapping the destination region as 1592 * single-sampled when it is in fact multisampled. We must also align 1593 * it to a multiple of the multisampling pattern, because the 1594 * differences between multisampled and single-sampled surface formats 1595 * will mean that pixels are scrambled within the multisampling pattern. 1596 * TODO: what if this makes the coordinates too large? 1597 * 1598 * Note: this only works if the destination surface uses the IMS layout. 1599 * If it's UMS, then we have no choice but to set up the rendering 1600 * pipeline as multisampled. 1601 */ 1602 assert(dst_mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS); 1603 x0 = (x0 * 2) & ~3; 1604 y0 = (y0 * 2) & ~3; 1605 x1 = ALIGN(x1 * 2, 4); 1606 y1 = ALIGN(y1 * 2, 4); 1607 wm_prog_key.use_kill = true; 1608 } 1609 1610 if (dst.map_stencil_as_y_tiled) { 1611 /* We must modify the rectangle we send through the rendering pipeline, 1612 * to account for the fact that we are mapping it as Y-tiled when it is 1613 * in fact W-tiled. Y tiles have dimensions 128x32 whereas W tiles have 1614 * dimensions 64x64. We must also align it to a multiple of the tile 1615 * size, because the differences between W and Y tiling formats will 1616 * mean that pixels are scrambled within the tile. 
1617 * 1618 * Note: if the destination surface configured to use IMS layout, then 1619 * the effective tile size we need to align it to is smaller, because 1620 * each pixel covers a 2x2 or a 4x2 block of samples. 1621 * 1622 * TODO: what if this makes the coordinates too large? 1623 */ 1624 unsigned x_align = 64, y_align = 64; 1625 if (dst_mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS) { 1626 x_align /= (dst_mt->num_samples == 4 ? 2 : 4); 1627 y_align /= 2; 1628 } 1629 x0 = (x0 & ~(x_align - 1)) * 2; 1630 y0 = (y0 & ~(y_align - 1)) / 2; 1631 x1 = ALIGN(x1, x_align) * 2; 1632 y1 = ALIGN(y1, y_align) / 2; 1633 wm_prog_key.use_kill = true; 1634 } 1635} 1636 1637uint32_t 1638brw_blorp_blit_params::get_wm_prog(struct brw_context *brw, 1639 brw_blorp_prog_data **prog_data) const 1640{ 1641 uint32_t prog_offset; 1642 if (!brw_search_cache(&brw->cache, BRW_BLORP_BLIT_PROG, 1643 &this->wm_prog_key, sizeof(this->wm_prog_key), 1644 &prog_offset, prog_data)) { 1645 brw_blorp_blit_program prog(brw, &this->wm_prog_key); 1646 GLuint program_size; 1647 const GLuint *program = prog.compile(brw, &program_size); 1648 brw_upload_cache(&brw->cache, BRW_BLORP_BLIT_PROG, 1649 &this->wm_prog_key, sizeof(this->wm_prog_key), 1650 program, program_size, 1651 &prog.prog_data, sizeof(prog.prog_data), 1652 &prog_offset, prog_data); 1653 } 1654 return prog_offset; 1655} 1656