lp_bld_conv.c revision fdeb0394cbc737cefa36c6bf99cbd255d8899a9f
1/************************************************************************** 2 * 3 * Copyright 2009 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 29/** 30 * @file 31 * Helper functions for type conversions. 32 * 33 * We want to use the fastest type for a given computation whenever feasible. 34 * The other side of this is that we need to be able convert between several 35 * types accurately and efficiently. 36 * 37 * Conversion between types of different bit width is quite complex since a 38 * 39 * To remember there are a few invariants in type conversions: 40 * 41 * - register width must remain constant: 42 * 43 * src_type.width * src_type.length == dst_type.width * dst_type.length 44 * 45 * - total number of elements must remain constant: 46 * 47 * src_type.length * num_srcs == dst_type.length * num_dsts 48 * 49 * It is not always possible to do the conversion both accurately and 50 * efficiently, usually due to lack of adequate machine instructions. In these 51 * cases it is important not to cut shortcuts here and sacrifice accuracy, as 52 * there this functions can be used anywhere. In the future we might have a 53 * precision parameter which can gauge the accuracy vs efficiency compromise, 54 * but for now if the data conversion between two stages happens to be the 55 * bottleneck, then most likely should just avoid converting at all and run 56 * both stages with the same type. 57 * 58 * Make sure to run lp_test_conv unit test after any change to this file. 59 * 60 * @author Jose Fonseca <jfonseca@vmware.com> 61 */ 62 63 64#include "util/u_debug.h" 65#include "util/u_math.h" 66#include "util/u_cpu_detect.h" 67 68#include "lp_bld_type.h" 69#include "lp_bld_const.h" 70#include "lp_bld_arit.h" 71#include "lp_bld_pack.h" 72#include "lp_bld_conv.h" 73 74 75/** 76 * Special case for converting clamped IEEE-754 floats to unsigned norms. 77 * 78 * The mathematical voodoo below may seem excessive but it is actually 79 * paramount we do it this way for several reasons. First, there is no single 80 * precision FP to unsigned integer conversion Intel SSE instruction. Second, 81 * secondly, even if there was, since the FP's mantissa takes only a fraction 82 * of register bits the typically scale and cast approach would require double 83 * precision for accurate results, and therefore half the throughput 84 * 85 * Although the result values can be scaled to an arbitrary bit width specified 86 * by dst_width, the actual result type will have the same width. 87 * 88 * Ex: src = { float, float, float, float } 89 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1]. 90 */ 91LLVMValueRef 92lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm, 93 struct lp_type src_type, 94 unsigned dst_width, 95 LLVMValueRef src) 96{ 97 LLVMBuilderRef builder = gallivm->builder; 98 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type); 99 LLVMValueRef res; 100 unsigned mantissa; 101 102 assert(src_type.floating); 103 assert(dst_width <= src_type.width); 104 src_type.sign = FALSE; 105 106 mantissa = lp_mantissa(src_type); 107 108 if (dst_width <= mantissa) { 109 /* 110 * Apply magic coefficients that will make the desired result to appear 111 * in the lowest significant bits of the mantissa, with correct rounding. 112 * 113 * This only works if the destination width fits in the mantissa. 114 */ 115 116 unsigned long long ubound; 117 unsigned long long mask; 118 double scale; 119 double bias; 120 121 ubound = (1ULL << dst_width); 122 mask = ubound - 1; 123 scale = (double)mask/ubound; 124 bias = (double)(1ULL << (mantissa - dst_width)); 125 126 res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), ""); 127 res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), ""); 128 res = LLVMBuildBitCast(builder, res, int_vec_type, ""); 129 res = LLVMBuildAnd(builder, res, 130 lp_build_const_int_vec(gallivm, src_type, mask), ""); 131 } 132 else if (dst_width == (mantissa + 1)) { 133 /* 134 * The destination width matches exactly what can be represented in 135 * floating point (i.e., mantissa + 1 bits). So do a straight 136 * multiplication followed by casting. No further rounding is necessary. 137 */ 138 139 double scale; 140 141 scale = (double)((1ULL << dst_width) - 1); 142 143 res = LLVMBuildFMul(builder, src, 144 lp_build_const_vec(gallivm, src_type, scale), ""); 145 res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); 146 } 147 else { 148 /* 149 * The destination exceeds what can be represented in the floating point. 150 * So multiply by the largest power two we get away with, and when 151 * subtract the most significant bit to rescale to normalized values. 152 * 153 * The largest power of two factor we can get away is 154 * (1 << (src_type.width - 1)), because we need to use signed . In theory it 155 * should be (1 << (src_type.width - 2)), but IEEE 754 rules states 156 * INT_MIN should be returned in FPToSI, which is the correct result for 157 * values near 1.0! 158 * 159 * This means we get (src_type.width - 1) correct bits for values near 0.0, 160 * and (mantissa + 1) correct bits for values near 1.0. Equally or more 161 * important, we also get exact results for 0.0 and 1.0. 162 */ 163 164 unsigned n = MIN2(src_type.width - 1, dst_width); 165 166 double scale = (double)(1ULL << n); 167 unsigned lshift = dst_width - n; 168 unsigned rshift = n; 169 LLVMValueRef lshifted; 170 LLVMValueRef rshifted; 171 172 res = LLVMBuildFMul(builder, src, 173 lp_build_const_vec(gallivm, src_type, scale), ""); 174 res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); 175 176 /* 177 * Align the most significant bit to its final place. 178 * 179 * This will cause 1.0 to overflow to 0, but the later adjustment will 180 * get it right. 181 */ 182 if (lshift) { 183 lshifted = LLVMBuildShl(builder, res, 184 lp_build_const_int_vec(gallivm, src_type, 185 lshift), ""); 186 } else { 187 lshifted = res; 188 } 189 190 /* 191 * Align the most significant bit to the right. 192 */ 193 rshifted = LLVMBuildLShr(builder, res, 194 lp_build_const_int_vec(gallivm, src_type, rshift), 195 ""); 196 197 /* 198 * Subtract the MSB to the LSB, therefore re-scaling from 199 * (1 << dst_width) to ((1 << dst_width) - 1). 200 */ 201 202 res = LLVMBuildSub(builder, lshifted, rshifted, ""); 203 } 204 205 return res; 206} 207 208 209/** 210 * Inverse of lp_build_clamped_float_to_unsigned_norm above. 211 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1] 212 * return {float, float, float, float} with values in range [0, 1]. 213 */ 214LLVMValueRef 215lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm, 216 unsigned src_width, 217 struct lp_type dst_type, 218 LLVMValueRef src) 219{ 220 LLVMBuilderRef builder = gallivm->builder; 221 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type); 222 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type); 223 LLVMValueRef bias_; 224 LLVMValueRef res; 225 unsigned mantissa; 226 unsigned n; 227 unsigned long long ubound; 228 unsigned long long mask; 229 double scale; 230 double bias; 231 232 assert(dst_type.floating); 233 234 mantissa = lp_mantissa(dst_type); 235 236 if (src_width <= (mantissa + 1)) { 237 /* 238 * The source width matches fits what can be represented in floating 239 * point (i.e., mantissa + 1 bits). So do a straight multiplication 240 * followed by casting. No further rounding is necessary. 241 */ 242 243 scale = 1.0/(double)((1ULL << src_width) - 1); 244 res = LLVMBuildSIToFP(builder, src, vec_type, ""); 245 res = LLVMBuildFMul(builder, res, 246 lp_build_const_vec(gallivm, dst_type, scale), ""); 247 return res; 248 } 249 else { 250 /* 251 * The source width exceeds what can be represented in floating 252 * point. So truncate the incoming values. 253 */ 254 255 n = MIN2(mantissa, src_width); 256 257 ubound = ((unsigned long long)1 << n); 258 mask = ubound - 1; 259 scale = (double)ubound/mask; 260 bias = (double)((unsigned long long)1 << (mantissa - n)); 261 262 res = src; 263 264 if (src_width > mantissa) { 265 int shift = src_width - mantissa; 266 res = LLVMBuildLShr(builder, res, 267 lp_build_const_int_vec(gallivm, dst_type, shift), ""); 268 } 269 270 bias_ = lp_build_const_vec(gallivm, dst_type, bias); 271 272 res = LLVMBuildOr(builder, 273 res, 274 LLVMBuildBitCast(builder, bias_, int_vec_type, ""), ""); 275 276 res = LLVMBuildBitCast(builder, res, vec_type, ""); 277 278 res = LLVMBuildFSub(builder, res, bias_, ""); 279 res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), ""); 280 } 281 282 return res; 283} 284 285 286/** 287 * Generic type conversion. 288 * 289 * TODO: Take a precision argument, or even better, add a new precision member 290 * to the lp_type union. 291 */ 292void 293lp_build_conv(struct gallivm_state *gallivm, 294 struct lp_type src_type, 295 struct lp_type dst_type, 296 const LLVMValueRef *src, unsigned num_srcs, 297 LLVMValueRef *dst, unsigned num_dsts) 298{ 299 LLVMBuilderRef builder = gallivm->builder; 300 struct lp_type tmp_type; 301 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; 302 unsigned num_tmps; 303 unsigned i; 304 305 /* We must not loose or gain channels. Only precision */ 306 assert(src_type.length * num_srcs == dst_type.length * num_dsts); 307 308 assert(src_type.length <= LP_MAX_VECTOR_LENGTH); 309 assert(dst_type.length <= LP_MAX_VECTOR_LENGTH); 310 assert(num_srcs <= LP_MAX_VECTOR_LENGTH); 311 assert(num_dsts <= LP_MAX_VECTOR_LENGTH); 312 313 tmp_type = src_type; 314 for(i = 0; i < num_srcs; ++i) { 315 assert(lp_check_value(src_type, src[i])); 316 tmp[i] = src[i]; 317 } 318 num_tmps = num_srcs; 319 320 321 /* Special case 4x4f --> 1x16ub 322 */ 323 if (src_type.floating == 1 && 324 src_type.fixed == 0 && 325 src_type.sign == 1 && 326 src_type.norm == 0 && 327 src_type.width == 32 && 328 src_type.length == 4 && 329 330 dst_type.floating == 0 && 331 dst_type.fixed == 0 && 332 dst_type.sign == 0 && 333 dst_type.norm == 1 && 334 dst_type.width == 8 && 335 dst_type.length == 16 && 336 337 util_cpu_caps.has_sse2) 338 { 339 int i; 340 341 for (i = 0; i < num_dsts; i++, src += 4) { 342 struct lp_type int16_type = dst_type; 343 struct lp_type int32_type = dst_type; 344 LLVMValueRef lo, hi; 345 LLVMValueRef src_int0; 346 LLVMValueRef src_int1; 347 LLVMValueRef src_int2; 348 LLVMValueRef src_int3; 349 LLVMTypeRef int32_vec_type; 350 LLVMTypeRef src_vec_type; 351 LLVMValueRef const_255f; 352 LLVMValueRef a, b, c, d; 353 354 int16_type.width *= 2; 355 int16_type.length /= 2; 356 int16_type.sign = 1; 357 358 int32_type.width *= 4; 359 int32_type.length /= 4; 360 int32_type.sign = 1; 361 362 src_vec_type = lp_build_vec_type(gallivm, src_type); 363 int32_vec_type = lp_build_vec_type(gallivm, int32_type); 364 365 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); 366 367 a = LLVMBuildFMul(builder, src[0], const_255f, ""); 368 b = LLVMBuildFMul(builder, src[1], const_255f, ""); 369 c = LLVMBuildFMul(builder, src[2], const_255f, ""); 370 d = LLVMBuildFMul(builder, src[3], const_255f, ""); 371 372 { 373 struct lp_build_context bld; 374 375 bld.gallivm = gallivm; 376 bld.type = src_type; 377 bld.vec_type = src_vec_type; 378 bld.int_elem_type = lp_build_elem_type(gallivm, int32_type); 379 bld.int_vec_type = int32_vec_type; 380 bld.undef = lp_build_undef(gallivm, src_type); 381 bld.zero = lp_build_zero(gallivm, src_type); 382 bld.one = lp_build_one(gallivm, src_type); 383 384 src_int0 = lp_build_iround(&bld, a); 385 src_int1 = lp_build_iround(&bld, b); 386 src_int2 = lp_build_iround(&bld, c); 387 src_int3 = lp_build_iround(&bld, d); 388 } 389 /* relying on clamping behavior of sse2 intrinsics here */ 390 lo = lp_build_pack2(gallivm, int32_type, int16_type, src_int0, src_int1); 391 hi = lp_build_pack2(gallivm, int32_type, int16_type, src_int2, src_int3); 392 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi); 393 } 394 return; 395 } 396 397 /* 398 * Clamp if necessary 399 */ 400 401 if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) { 402 struct lp_build_context bld; 403 double src_min = lp_const_min(src_type); 404 double dst_min = lp_const_min(dst_type); 405 double src_max = lp_const_max(src_type); 406 double dst_max = lp_const_max(dst_type); 407 LLVMValueRef thres; 408 409 lp_build_context_init(&bld, gallivm, tmp_type); 410 411 if(src_min < dst_min) { 412 if(dst_min == 0.0) 413 thres = bld.zero; 414 else 415 thres = lp_build_const_vec(gallivm, src_type, dst_min); 416 for(i = 0; i < num_tmps; ++i) 417 tmp[i] = lp_build_max(&bld, tmp[i], thres); 418 } 419 420 if(src_max > dst_max) { 421 if(dst_max == 1.0) 422 thres = bld.one; 423 else 424 thres = lp_build_const_vec(gallivm, src_type, dst_max); 425 for(i = 0; i < num_tmps; ++i) 426 tmp[i] = lp_build_min(&bld, tmp[i], thres); 427 } 428 } 429 430 /* 431 * Scale to the narrowest range 432 */ 433 434 if(dst_type.floating) { 435 /* Nothing to do */ 436 } 437 else if(tmp_type.floating) { 438 if(!dst_type.fixed && !dst_type.sign && dst_type.norm) { 439 for(i = 0; i < num_tmps; ++i) { 440 tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm, 441 tmp_type, 442 dst_type.width, 443 tmp[i]); 444 } 445 tmp_type.floating = FALSE; 446 } 447 else { 448 double dst_scale = lp_const_scale(dst_type); 449 LLVMTypeRef tmp_vec_type; 450 451 if (dst_scale != 1.0) { 452 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale); 453 for(i = 0; i < num_tmps; ++i) 454 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, ""); 455 } 456 457 /* Use an equally sized integer for intermediate computations */ 458 tmp_type.floating = FALSE; 459 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type); 460 for(i = 0; i < num_tmps; ++i) { 461#if 0 462 if(dst_type.sign) 463 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); 464 else 465 tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, ""); 466#else 467 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */ 468 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); 469#endif 470 } 471 } 472 } 473 else { 474 unsigned src_shift = lp_const_shift(src_type); 475 unsigned dst_shift = lp_const_shift(dst_type); 476 unsigned src_offset = lp_const_offset(src_type); 477 unsigned dst_offset = lp_const_offset(dst_type); 478 479 /* Compensate for different offsets */ 480 if (dst_offset > src_offset && src_type.width > dst_type.width) { 481 for (i = 0; i < num_tmps; ++i) { 482 LLVMValueRef shifted; 483 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1); 484 if(src_type.sign) 485 shifted = LLVMBuildAShr(builder, tmp[i], shift, ""); 486 else 487 shifted = LLVMBuildLShr(builder, tmp[i], shift, ""); 488 489 tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, ""); 490 } 491 } 492 493 if(src_shift > dst_shift) { 494 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, 495 src_shift - dst_shift); 496 for(i = 0; i < num_tmps; ++i) 497 if(src_type.sign) 498 tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, ""); 499 else 500 tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, ""); 501 } 502 } 503 504 /* 505 * Truncate or expand bit width 506 * 507 * No data conversion should happen here, although the sign bits are 508 * crucial to avoid bad clamping. 509 */ 510 511 { 512 struct lp_type new_type; 513 514 new_type = tmp_type; 515 new_type.sign = dst_type.sign; 516 new_type.width = dst_type.width; 517 new_type.length = dst_type.length; 518 519 lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts); 520 521 tmp_type = new_type; 522 num_tmps = num_dsts; 523 } 524 525 /* 526 * Scale to the widest range 527 */ 528 529 if(src_type.floating) { 530 /* Nothing to do */ 531 } 532 else if(!src_type.floating && dst_type.floating) { 533 if(!src_type.fixed && !src_type.sign && src_type.norm) { 534 for(i = 0; i < num_tmps; ++i) { 535 tmp[i] = lp_build_unsigned_norm_to_float(gallivm, 536 src_type.width, 537 dst_type, 538 tmp[i]); 539 } 540 tmp_type.floating = TRUE; 541 } 542 else { 543 double src_scale = lp_const_scale(src_type); 544 LLVMTypeRef tmp_vec_type; 545 546 /* Use an equally sized integer for intermediate computations */ 547 tmp_type.floating = TRUE; 548 tmp_type.sign = TRUE; 549 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type); 550 for(i = 0; i < num_tmps; ++i) { 551#if 0 552 if(dst_type.sign) 553 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, ""); 554 else 555 tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, ""); 556#else 557 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */ 558 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, ""); 559#endif 560 } 561 562 if (src_scale != 1.0) { 563 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale); 564 for(i = 0; i < num_tmps; ++i) 565 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, ""); 566 } 567 } 568 } 569 else { 570 unsigned src_shift = lp_const_shift(src_type); 571 unsigned dst_shift = lp_const_shift(dst_type); 572 unsigned src_offset = lp_const_offset(src_type); 573 unsigned dst_offset = lp_const_offset(dst_type); 574 575 if (src_shift < dst_shift) { 576 LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH]; 577 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift); 578 579 for (i = 0; i < num_tmps; ++i) { 580 pre_shift[i] = tmp[i]; 581 tmp[i] = LLVMBuildShl(builder, tmp[i], shift, ""); 582 } 583 584 /* Compensate for different offsets */ 585 if (dst_offset > src_offset) { 586 for (i = 0; i < num_tmps; ++i) { 587 tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], ""); 588 } 589 } 590 } 591 } 592 593 for(i = 0; i < num_dsts; ++i) { 594 dst[i] = tmp[i]; 595 assert(lp_check_value(dst_type, dst[i])); 596 } 597} 598 599 600/** 601 * Bit mask conversion. 602 * 603 * This will convert the integer masks that match the given types. 604 * 605 * The mask values should 0 or -1, i.e., all bits either set to zero or one. 606 * Any other value will likely cause in unpredictable results. 607 * 608 * This is basically a very trimmed down version of lp_build_conv. 609 */ 610void 611lp_build_conv_mask(struct gallivm_state *gallivm, 612 struct lp_type src_type, 613 struct lp_type dst_type, 614 const LLVMValueRef *src, unsigned num_srcs, 615 LLVMValueRef *dst, unsigned num_dsts) 616{ 617 /* Register width must remain constant */ 618 assert(src_type.width * src_type.length == dst_type.width * dst_type.length); 619 620 /* We must not loose or gain channels. Only precision */ 621 assert(src_type.length * num_srcs == dst_type.length * num_dsts); 622 623 /* 624 * Drop 625 * 626 * We assume all values are 0 or -1 627 */ 628 629 src_type.floating = FALSE; 630 src_type.fixed = FALSE; 631 src_type.sign = TRUE; 632 src_type.norm = FALSE; 633 634 dst_type.floating = FALSE; 635 dst_type.fixed = FALSE; 636 dst_type.sign = TRUE; 637 dst_type.norm = FALSE; 638 639 /* 640 * Truncate or expand bit width 641 */ 642 643 if(src_type.width > dst_type.width) { 644 assert(num_dsts == 1); 645 dst[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs); 646 } 647 else if(src_type.width < dst_type.width) { 648 assert(num_srcs == 1); 649 lp_build_unpack(gallivm, src_type, dst_type, src[0], dst, num_dsts); 650 } 651 else { 652 assert(num_srcs == num_dsts); 653 memcpy(dst, src, num_dsts * sizeof *dst); 654 } 655} 656