lp_bld_format_aos.c revision 4634cb5921b985f04f2daf00cda2d28036143bd3
1/************************************************************************** 2 * 3 * Copyright 2009 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28/** 29 * @file 30 * AoS pixel format manipulation. 31 * 32 * @author Jose Fonseca <jfonseca@vmware.com> 33 */ 34 35 36#include "util/u_format.h" 37#include "util/u_memory.h" 38#include "util/u_math.h" 39#include "util/u_pointer.h" 40#include "util/u_string.h" 41#include "util/u_cpu_detect.h" 42 43#include "lp_bld_arit.h" 44#include "lp_bld_init.h" 45#include "lp_bld_type.h" 46#include "lp_bld_flow.h" 47#include "lp_bld_const.h" 48#include "lp_bld_conv.h" 49#include "lp_bld_swizzle.h" 50#include "lp_bld_gather.h" 51#include "lp_bld_debug.h" 52#include "lp_bld_format.h" 53#include "lp_bld_pack.h" 54#include "lp_bld_intr.h" 55#include "lp_bld_logic.h" 56#include "lp_bld_bitarit.h" 57 58 59/** 60 * Basic swizzling. Rearrange the order of the unswizzled array elements 61 * according to the format description. PIPE_SWIZZLE_0/ONE are supported 62 * too. 63 * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}. 64 */ 65LLVMValueRef 66lp_build_format_swizzle_aos(const struct util_format_description *desc, 67 struct lp_build_context *bld, 68 LLVMValueRef unswizzled) 69{ 70 unsigned char swizzles[4]; 71 unsigned chan; 72 73 assert(bld->type.length % 4 == 0); 74 75 for (chan = 0; chan < 4; ++chan) { 76 enum pipe_swizzle swizzle; 77 78 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { 79 /* 80 * For ZS formats do RGBA = ZZZ1 81 */ 82 if (chan == 3) { 83 swizzle = PIPE_SWIZZLE_1; 84 } else if (desc->swizzle[0] == PIPE_SWIZZLE_NONE) { 85 swizzle = PIPE_SWIZZLE_0; 86 } else { 87 swizzle = desc->swizzle[0]; 88 } 89 } else { 90 swizzle = desc->swizzle[chan]; 91 } 92 swizzles[chan] = swizzle; 93 } 94 95 return lp_build_swizzle_aos(bld, unswizzled, swizzles); 96} 97 98 99/** 100 * Whether the format matches the vector type, apart of swizzles. 101 */ 102static inline boolean 103format_matches_type(const struct util_format_description *desc, 104 struct lp_type type) 105{ 106 enum util_format_type chan_type; 107 unsigned chan; 108 109 assert(type.length % 4 == 0); 110 111 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN || 112 desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB || 113 desc->block.width != 1 || 114 desc->block.height != 1) { 115 return FALSE; 116 } 117 118 if (type.floating) { 119 chan_type = UTIL_FORMAT_TYPE_FLOAT; 120 } else if (type.fixed) { 121 chan_type = UTIL_FORMAT_TYPE_FIXED; 122 } else if (type.sign) { 123 chan_type = UTIL_FORMAT_TYPE_SIGNED; 124 } else { 125 chan_type = UTIL_FORMAT_TYPE_UNSIGNED; 126 } 127 128 for (chan = 0; chan < desc->nr_channels; ++chan) { 129 if (desc->channel[chan].size != type.width) { 130 return FALSE; 131 } 132 133 if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) { 134 if (desc->channel[chan].type != chan_type || 135 desc->channel[chan].normalized != type.norm) { 136 return FALSE; 137 } 138 } 139 } 140 141 return TRUE; 142} 143 144/* 145 * Do rounding when converting small unorm values to larger ones. 146 * Not quite 100% accurate, as it's done by appending MSBs, but 147 * should be good enough. 148 */ 149 150static inline LLVMValueRef 151scale_bits_up(struct gallivm_state *gallivm, 152 int src_bits, 153 int dst_bits, 154 LLVMValueRef src, 155 struct lp_type src_type) 156{ 157 LLVMBuilderRef builder = gallivm->builder; 158 LLVMValueRef result = src; 159 160 if (src_bits == 1 && dst_bits > 1) { 161 /* 162 * Useful for a1 - we'd need quite some repeated copies otherwise. 163 */ 164 struct lp_build_context bld; 165 LLVMValueRef dst_mask; 166 lp_build_context_init(&bld, gallivm, src_type); 167 dst_mask = lp_build_const_int_vec(gallivm, src_type, 168 (1 << dst_bits) - 1), 169 result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src, 170 lp_build_const_int_vec(gallivm, src_type, 0)); 171 result = lp_build_andnot(&bld, dst_mask, result); 172 } 173 else if (dst_bits > src_bits) { 174 /* Scale up bits */ 175 int db = dst_bits - src_bits; 176 177 /* Shift left by difference in bits */ 178 result = LLVMBuildShl(builder, 179 src, 180 lp_build_const_int_vec(gallivm, src_type, db), 181 ""); 182 183 if (db <= src_bits) { 184 /* Enough bits in src to fill the remainder */ 185 LLVMValueRef lower = LLVMBuildLShr(builder, 186 src, 187 lp_build_const_int_vec(gallivm, src_type, 188 src_bits - db), 189 ""); 190 191 result = LLVMBuildOr(builder, result, lower, ""); 192 } else if (db > src_bits) { 193 /* Need to repeatedly copy src bits to fill remainder in dst */ 194 unsigned n; 195 196 for (n = src_bits; n < dst_bits; n *= 2) { 197 LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n); 198 199 result = LLVMBuildOr(builder, 200 result, 201 LLVMBuildLShr(builder, result, shuv, ""), 202 ""); 203 } 204 } 205 } else { 206 assert (dst_bits == src_bits); 207 } 208 209 return result; 210} 211 212/** 213 * Unpack a single pixel into its XYZW components. 214 * 215 * @param desc the pixel format for the packed pixel value 216 * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM 217 * 218 * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector. 219 */ 220static inline LLVMValueRef 221lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, 222 const struct util_format_description *desc, 223 LLVMValueRef packed) 224{ 225 LLVMBuilderRef builder = gallivm->builder; 226 LLVMValueRef shifted, casted, scaled, masked; 227 LLVMValueRef shifts[4]; 228 LLVMValueRef masks[4]; 229 LLVMValueRef scales[4]; 230 LLVMTypeRef vec32_type; 231 232 boolean normalized; 233 boolean needs_uitofp; 234 unsigned i; 235 236 /* TODO: Support more formats */ 237 assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); 238 assert(desc->block.width == 1); 239 assert(desc->block.height == 1); 240 assert(desc->block.bits <= 32); 241 242 /* Do the intermediate integer computations with 32bit integers since it 243 * matches floating point size */ 244 assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context)); 245 246 vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4); 247 248 /* Broadcast the packed value to all four channels 249 * before: packed = BGRA 250 * after: packed = {BGRA, BGRA, BGRA, BGRA} 251 */ 252 packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed, 253 LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)), 254 ""); 255 packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type), 256 LLVMConstNull(vec32_type), 257 ""); 258 259 /* Initialize vector constants */ 260 normalized = FALSE; 261 needs_uitofp = FALSE; 262 263 /* Loop over 4 color components */ 264 for (i = 0; i < 4; ++i) { 265 unsigned bits = desc->channel[i].size; 266 unsigned shift = desc->channel[i].shift; 267 268 if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) { 269 shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); 270 masks[i] = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)); 271 scales[i] = LLVMConstNull(LLVMFloatTypeInContext(gallivm->context)); 272 } 273 else { 274 unsigned long long mask = (1ULL << bits) - 1; 275 276 assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED); 277 278 if (bits == 32) { 279 needs_uitofp = TRUE; 280 } 281 282 shifts[i] = lp_build_const_int32(gallivm, shift); 283 masks[i] = lp_build_const_int32(gallivm, mask); 284 285 if (desc->channel[i].normalized) { 286 scales[i] = lp_build_const_float(gallivm, 1.0 / mask); 287 normalized = TRUE; 288 } 289 else 290 scales[i] = lp_build_const_float(gallivm, 1.0); 291 } 292 } 293 294 /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW} 295 * into masked = {X, Y, Z, W} 296 */ 297 if (desc->block.bits < 32 && normalized) { 298 /* 299 * Note: we cannot do the shift below on x86 natively until AVX2. 300 * 301 * Old llvm versions will resort to scalar extract/shift insert, 302 * which is definitely terrible, new versions will just do 303 * several vector shifts and shuffle/blend results together. 304 * We could turn this into a variable left shift plus a constant 305 * right shift, and llvm would then turn the variable left shift 306 * into a mul for us (albeit without sse41 the mul needs emulation 307 * too...). However, since we're going to do a float mul 308 * anyway, we just adjust that mul instead (plus the mask), skipping 309 * the shift completely. 310 * We could also use a extra mul when the format isn't normalized and 311 * we don't have AVX2 support, but don't bother for now. Unfortunately, 312 * this strategy doesn't work for 32bit formats (such as rgb10a2 or even 313 * rgba8 if it ends up here), as that would require UIToFP, albeit that 314 * would be fixable with easy 16bit shuffle (unless there's channels 315 * crossing 16bit boundaries). 316 */ 317 for (i = 0; i < 4; ++i) { 318 if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) { 319 unsigned bits = desc->channel[i].size; 320 unsigned shift = desc->channel[i].shift; 321 unsigned long long mask = ((1ULL << bits) - 1) << shift; 322 scales[i] = lp_build_const_float(gallivm, 1.0 / mask); 323 masks[i] = lp_build_const_int32(gallivm, mask); 324 } 325 } 326 masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), ""); 327 } else { 328 shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), ""); 329 masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), ""); 330 } 331 332 if (!needs_uitofp) { 333 /* UIToFP can't be expressed in SSE2 */ 334 casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), ""); 335 } else { 336 casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), ""); 337 } 338 339 /* 340 * At this point 'casted' may be a vector of floats such as 341 * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied 342 * by powers of two). Next, if the pixel values are normalized 343 * we'll scale this to {1.0, 1.0, 1.0, 1.0}. 344 */ 345 346 if (normalized) 347 scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), ""); 348 else 349 scaled = casted; 350 351 return scaled; 352} 353 354 355/** 356 * Pack a single pixel. 357 * 358 * @param rgba 4 float vector with the unpacked components. 359 * 360 * XXX: This is mostly for reference and testing -- operating a single pixel at 361 * a time is rarely if ever needed. 362 */ 363LLVMValueRef 364lp_build_pack_rgba_aos(struct gallivm_state *gallivm, 365 const struct util_format_description *desc, 366 LLVMValueRef rgba) 367{ 368 LLVMBuilderRef builder = gallivm->builder; 369 LLVMTypeRef type; 370 LLVMValueRef packed = NULL; 371 LLVMValueRef swizzles[4]; 372 LLVMValueRef shifted, casted, scaled, unswizzled; 373 LLVMValueRef shifts[4]; 374 LLVMValueRef scales[4]; 375 boolean normalized; 376 unsigned i, j; 377 378 assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); 379 assert(desc->block.width == 1); 380 assert(desc->block.height == 1); 381 382 type = LLVMIntTypeInContext(gallivm->context, desc->block.bits); 383 384 /* Unswizzle the color components into the source vector. */ 385 for (i = 0; i < 4; ++i) { 386 for (j = 0; j < 4; ++j) { 387 if (desc->swizzle[j] == i) 388 break; 389 } 390 if (j < 4) 391 swizzles[i] = lp_build_const_int32(gallivm, j); 392 else 393 swizzles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); 394 } 395 396 unswizzled = LLVMBuildShuffleVector(builder, rgba, 397 LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4)), 398 LLVMConstVector(swizzles, 4), ""); 399 400 normalized = FALSE; 401 for (i = 0; i < 4; ++i) { 402 unsigned bits = desc->channel[i].size; 403 unsigned shift = desc->channel[i].shift; 404 405 if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) { 406 shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); 407 scales[i] = LLVMGetUndef(LLVMFloatTypeInContext(gallivm->context)); 408 } 409 else { 410 unsigned mask = (1 << bits) - 1; 411 412 assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED); 413 assert(bits < 32); 414 415 shifts[i] = lp_build_const_int32(gallivm, shift); 416 417 if (desc->channel[i].normalized) { 418 scales[i] = lp_build_const_float(gallivm, mask); 419 normalized = TRUE; 420 } 421 else 422 scales[i] = lp_build_const_float(gallivm, 1.0); 423 } 424 } 425 426 if (normalized) 427 scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), ""); 428 else 429 scaled = unswizzled; 430 431 casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), ""); 432 433 shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), ""); 434 435 /* Bitwise or all components */ 436 for (i = 0; i < 4; ++i) { 437 if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) { 438 LLVMValueRef component = LLVMBuildExtractElement(builder, shifted, 439 lp_build_const_int32(gallivm, i), ""); 440 if (packed) 441 packed = LLVMBuildOr(builder, packed, component, ""); 442 else 443 packed = component; 444 } 445 } 446 447 if (!packed) 448 packed = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); 449 450 if (desc->block.bits < 32) 451 packed = LLVMBuildTrunc(builder, packed, type, ""); 452 453 return packed; 454} 455 456 457 458 459/** 460 * Fetch a pixel into a 4 float AoS. 461 * 462 * \param format_desc describes format of the image we're fetching from 463 * \param aligned whether the data is guaranteed to be aligned 464 * \param ptr address of the pixel block (or the texel if uncompressed) 465 * \param i, j the sub-block pixel coordinates. For non-compressed formats 466 * these will always be (0, 0). 467 * \return a 4 element vector with the pixel's RGBA values. 468 */ 469LLVMValueRef 470lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, 471 const struct util_format_description *format_desc, 472 struct lp_type type, 473 boolean aligned, 474 LLVMValueRef base_ptr, 475 LLVMValueRef offset, 476 LLVMValueRef i, 477 LLVMValueRef j, 478 LLVMValueRef cache) 479{ 480 LLVMBuilderRef builder = gallivm->builder; 481 unsigned num_pixels = type.length / 4; 482 struct lp_build_context bld; 483 484 assert(type.length <= LP_MAX_VECTOR_LENGTH); 485 assert(type.length % 4 == 0); 486 487 lp_build_context_init(&bld, gallivm, type); 488 489 /* 490 * Trivial case 491 * 492 * The format matches the type (apart of a swizzle) so no need for 493 * scaling or converting. 494 */ 495 496 if (format_matches_type(format_desc, type) && 497 format_desc->block.bits <= type.width * 4 && 498 /* XXX this shouldn't be needed */ 499 util_is_power_of_two(format_desc->block.bits)) { 500 LLVMValueRef packed; 501 LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type); 502 struct lp_type fetch_type; 503 unsigned vec_len = type.width * type.length; 504 505 /* 506 * The format matches the type (apart of a swizzle) so no need for 507 * scaling or converting. 508 */ 509 510 fetch_type = lp_type_uint(type.width*4); 511 packed = lp_build_gather(gallivm, type.length/4, 512 format_desc->block.bits, fetch_type, 513 aligned, base_ptr, offset, TRUE); 514 515 assert(format_desc->block.bits <= vec_len); 516 (void) vec_len; /* silence unused var warning for non-debug build */ 517 518 packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, ""); 519 return lp_build_format_swizzle_aos(format_desc, &bld, packed); 520 } 521 522 /* 523 * Bit arithmetic for converting small_unorm to unorm8. 524 * 525 * This misses some opportunities for optimizations (like skipping mask 526 * for the highest channel for instance, or doing bit scaling in parallel 527 * for channels with the same bit width) but it should be passable for 528 * all arithmetic formats. 529 */ 530 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && 531 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && 532 util_format_fits_8unorm(format_desc) && 533 type.width == 8 && type.norm == 1 && type.sign == 0 && 534 type.fixed == 0 && type.floating == 0) { 535 LLVMValueRef packed, res, chans[4], rgba[4]; 536 LLVMTypeRef dst_vec_type, conv_vec_type; 537 struct lp_type fetch_type, conv_type; 538 struct lp_build_context bld_conv; 539 unsigned j; 540 541 fetch_type = lp_type_uint(type.width*4); 542 conv_type = lp_type_int_vec(type.width*4, type.width * type.length); 543 dst_vec_type = lp_build_vec_type(gallivm, type); 544 conv_vec_type = lp_build_vec_type(gallivm, conv_type); 545 lp_build_context_init(&bld, gallivm, conv_type); 546 547 packed = lp_build_gather(gallivm, type.length/4, 548 format_desc->block.bits, fetch_type, 549 aligned, base_ptr, offset, TRUE); 550 551 assert(format_desc->block.bits * type.length / 4 <= 552 type.width * type.length); 553 554 packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, ""); 555 556 for (j = 0; j < format_desc->nr_channels; ++j) { 557 unsigned mask = 0; 558 unsigned sa = format_desc->channel[j].shift; 559 560 mask = (1 << format_desc->channel[j].size) - 1; 561 562 /* Extract bits from source */ 563 chans[j] = LLVMBuildLShr(builder, packed, 564 lp_build_const_int_vec(gallivm, conv_type, sa), 565 ""); 566 567 chans[j] = LLVMBuildAnd(builder, chans[j], 568 lp_build_const_int_vec(gallivm, conv_type, mask), 569 ""); 570 571 /* Scale bits */ 572 if (type.norm) { 573 chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size, 574 type.width, chans[j], conv_type); 575 } 576 } 577 /* 578 * This is a hacked lp_build_format_swizzle_soa() since we need a 579 * normalized 1 but only 8 bits in a 32bit vector... 580 */ 581 for (j = 0; j < 4; ++j) { 582 enum pipe_swizzle swizzle = format_desc->swizzle[j]; 583 if (swizzle == PIPE_SWIZZLE_1) { 584 rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1); 585 } else { 586 rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle); 587 } 588 if (j == 0) { 589 res = rgba[j]; 590 } else { 591 rgba[j] = LLVMBuildShl(builder, rgba[j], 592 lp_build_const_int_vec(gallivm, conv_type, 593 j * type.width), ""); 594 res = LLVMBuildOr(builder, res, rgba[j], ""); 595 } 596 } 597 res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, ""); 598 599 return res; 600 } 601 602 /* 603 * Bit arithmetic 604 */ 605 606 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && 607 (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || 608 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && 609 format_desc->block.width == 1 && 610 format_desc->block.height == 1 && 611 /* XXX this shouldn't be needed */ 612 util_is_power_of_two(format_desc->block.bits) && 613 format_desc->block.bits <= 32 && 614 format_desc->is_bitmask && 615 !format_desc->is_mixed && 616 (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED || 617 format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED) && 618 !format_desc->channel[0].pure_integer) { 619 620 LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; 621 LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128]; 622 struct lp_type conv_type; 623 unsigned k, num_conv_src, num_conv_dst; 624 625 /* 626 * Note this path is generally terrible for fetching multiple pixels. 627 * We should make sure we cannot hit this code path for anything but 628 * single pixels. 629 */ 630 631 /* 632 * Unpack a pixel at a time into a <4 x float> RGBA vector 633 */ 634 635 for (k = 0; k < num_pixels; ++k) { 636 LLVMValueRef packed; 637 638 packed = lp_build_gather_elem(gallivm, num_pixels, 639 format_desc->block.bits, 32, aligned, 640 base_ptr, offset, k, FALSE); 641 642 tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm, 643 format_desc, 644 packed); 645 } 646 647 /* 648 * Type conversion. 649 * 650 * TODO: We could avoid floating conversion for integer to 651 * integer conversions. 652 */ 653 654 if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) { 655 debug_printf("%s: unpacking %s with floating point\n", 656 __FUNCTION__, format_desc->short_name); 657 } 658 659 conv_type = lp_float32_vec4_type(); 660 num_conv_src = num_pixels; 661 num_conv_dst = 1; 662 663 if (num_pixels % 8 == 0) { 664 lp_build_concat_n(gallivm, lp_float32_vec4_type(), 665 tmps, num_pixels, tmps, num_pixels / 2); 666 conv_type.length *= num_pixels / 4; 667 num_conv_src = 4 * num_pixels / 8; 668 if (type.width == 8 && type.floating == 0 && type.fixed == 0) { 669 /* 670 * FIXME: The fast float->unorm path (which is basically 671 * skipping the MIN/MAX which are extremely pointless in any 672 * case) requires that there's 2 destinations... 673 * In any case, we really should make sure we don't hit this 674 * code with multiple pixels for unorm8 dst types, it's 675 * completely hopeless even if we do hit the right conversion. 676 */ 677 type.length /= num_pixels / 4; 678 num_conv_dst = num_pixels / 4; 679 } 680 } 681 682 lp_build_conv(gallivm, conv_type, type, 683 tmps, num_conv_src, res, num_conv_dst); 684 685 if (num_pixels % 8 == 0 && 686 (type.width == 8 && type.floating == 0 && type.fixed == 0)) { 687 lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1); 688 } 689 690 return lp_build_format_swizzle_aos(format_desc, &bld, res[0]); 691 } 692 693 /* If all channels are of same type and we are not using half-floats */ 694 if (format_desc->is_array && 695 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) { 696 assert(!format_desc->is_mixed); 697 return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset); 698 } 699 700 /* 701 * YUV / subsampled formats 702 */ 703 704 if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { 705 struct lp_type tmp_type; 706 LLVMValueRef tmp; 707 708 memset(&tmp_type, 0, sizeof tmp_type); 709 tmp_type.width = 8; 710 tmp_type.length = num_pixels * 4; 711 tmp_type.norm = TRUE; 712 713 tmp = lp_build_fetch_subsampled_rgba_aos(gallivm, 714 format_desc, 715 num_pixels, 716 base_ptr, 717 offset, 718 i, j); 719 720 lp_build_conv(gallivm, 721 tmp_type, type, 722 &tmp, 1, &tmp, 1); 723 724 return tmp; 725 } 726 727 /* 728 * s3tc rgb formats 729 */ 730 731 if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) { 732 struct lp_type tmp_type; 733 LLVMValueRef tmp; 734 735 memset(&tmp_type, 0, sizeof tmp_type); 736 tmp_type.width = 8; 737 tmp_type.length = num_pixels * 4; 738 tmp_type.norm = TRUE; 739 740 tmp = lp_build_fetch_cached_texels(gallivm, 741 format_desc, 742 num_pixels, 743 base_ptr, 744 offset, 745 i, j, 746 cache); 747 748 lp_build_conv(gallivm, 749 tmp_type, type, 750 &tmp, 1, &tmp, 1); 751 752 return tmp; 753 } 754 755 /* 756 * Fallback to util_format_description::fetch_rgba_8unorm(). 757 */ 758 759 if (format_desc->fetch_rgba_8unorm && 760 !type.floating && type.width == 8 && !type.sign && type.norm) { 761 /* 762 * Fallback to calling util_format_description::fetch_rgba_8unorm. 763 * 764 * This is definitely not the most efficient way of fetching pixels, as 765 * we miss the opportunity to do vectorization, but this it is a 766 * convenient for formats or scenarios for which there was no opportunity 767 * or incentive to optimize. 768 */ 769 770 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); 771 LLVMTypeRef pi8t = LLVMPointerType(i8t, 0); 772 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); 773 LLVMValueRef function; 774 LLVMValueRef tmp_ptr; 775 LLVMValueRef tmp; 776 LLVMValueRef res; 777 unsigned k; 778 779 if (gallivm_debug & GALLIVM_DEBUG_PERF) { 780 debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n", 781 __FUNCTION__, format_desc->short_name); 782 } 783 784 /* 785 * Declare and bind format_desc->fetch_rgba_8unorm(). 786 */ 787 788 { 789 /* 790 * Function to call looks like: 791 * fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) 792 */ 793 LLVMTypeRef ret_type; 794 LLVMTypeRef arg_types[4]; 795 LLVMTypeRef function_type; 796 797 ret_type = LLVMVoidTypeInContext(gallivm->context); 798 arg_types[0] = pi8t; 799 arg_types[1] = pi8t; 800 arg_types[2] = i32t; 801 arg_types[3] = i32t; 802 function_type = LLVMFunctionType(ret_type, arg_types, 803 ARRAY_SIZE(arg_types), 0); 804 805 /* make const pointer for the C fetch_rgba_8unorm function */ 806 function = lp_build_const_int_pointer(gallivm, 807 func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm)); 808 809 /* cast the callee pointer to the function's type */ 810 function = LLVMBuildBitCast(builder, function, 811 LLVMPointerType(function_type, 0), 812 "cast callee"); 813 } 814 815 tmp_ptr = lp_build_alloca(gallivm, i32t, ""); 816 817 res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels)); 818 819 /* 820 * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result 821 * in the SoA vectors. 822 */ 823 824 for (k = 0; k < num_pixels; ++k) { 825 LLVMValueRef index = lp_build_const_int32(gallivm, k); 826 LLVMValueRef args[4]; 827 828 args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, ""); 829 args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels, 830 base_ptr, offset, k); 831 832 if (num_pixels == 1) { 833 args[2] = i; 834 args[3] = j; 835 } 836 else { 837 args[2] = LLVMBuildExtractElement(builder, i, index, ""); 838 args[3] = LLVMBuildExtractElement(builder, j, index, ""); 839 } 840 841 LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), ""); 842 843 tmp = LLVMBuildLoad(builder, tmp_ptr, ""); 844 845 if (num_pixels == 1) { 846 res = tmp; 847 } 848 else { 849 res = LLVMBuildInsertElement(builder, res, tmp, index, ""); 850 } 851 } 852 853 /* Bitcast from <n x i32> to <4n x i8> */ 854 res = LLVMBuildBitCast(builder, res, bld.vec_type, ""); 855 856 return res; 857 } 858 859 /* 860 * Fallback to util_format_description::fetch_rgba_float(). 861 */ 862 863 if (format_desc->fetch_rgba_float) { 864 /* 865 * Fallback to calling util_format_description::fetch_rgba_float. 866 * 867 * This is definitely not the most efficient way of fetching pixels, as 868 * we miss the opportunity to do vectorization, but this it is a 869 * convenient for formats or scenarios for which there was no opportunity 870 * or incentive to optimize. 871 */ 872 873 LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context); 874 LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4); 875 LLVMTypeRef pf32t = LLVMPointerType(f32t, 0); 876 LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); 877 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); 878 LLVMValueRef function; 879 LLVMValueRef tmp_ptr; 880 LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; 881 LLVMValueRef res; 882 unsigned k; 883 884 if (gallivm_debug & GALLIVM_DEBUG_PERF) { 885 debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n", 886 __FUNCTION__, format_desc->short_name); 887 } 888 889 /* 890 * Declare and bind format_desc->fetch_rgba_float(). 891 */ 892 893 { 894 /* 895 * Function to call looks like: 896 * fetch(float *dst, const uint8_t *src, unsigned i, unsigned j) 897 */ 898 LLVMTypeRef ret_type; 899 LLVMTypeRef arg_types[4]; 900 901 ret_type = LLVMVoidTypeInContext(gallivm->context); 902 arg_types[0] = pf32t; 903 arg_types[1] = pi8t; 904 arg_types[2] = i32t; 905 arg_types[3] = i32t; 906 907 function = lp_build_const_func_pointer(gallivm, 908 func_to_pointer((func_pointer) format_desc->fetch_rgba_float), 909 ret_type, 910 arg_types, ARRAY_SIZE(arg_types), 911 format_desc->short_name); 912 } 913 914 tmp_ptr = lp_build_alloca(gallivm, f32x4t, ""); 915 916 /* 917 * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result 918 * in the SoA vectors. 919 */ 920 921 for (k = 0; k < num_pixels; ++k) { 922 LLVMValueRef args[4]; 923 924 args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, ""); 925 args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels, 926 base_ptr, offset, k); 927 928 if (num_pixels == 1) { 929 args[2] = i; 930 args[3] = j; 931 } 932 else { 933 LLVMValueRef index = lp_build_const_int32(gallivm, k); 934 args[2] = LLVMBuildExtractElement(builder, i, index, ""); 935 args[3] = LLVMBuildExtractElement(builder, j, index, ""); 936 } 937 938 LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), ""); 939 940 tmps[k] = LLVMBuildLoad(builder, tmp_ptr, ""); 941 } 942 943 lp_build_conv(gallivm, 944 lp_float32_vec4_type(), 945 type, 946 tmps, num_pixels, &res, 1); 947 948 return res; 949 } 950 951 assert(!util_format_is_pure_integer(format_desc->format)); 952 953 assert(0); 954 return lp_build_undef(gallivm, type); 955} 956