lp_bld_conv.c revision 57d4e922a62921e7a8cfb1023ce0f68af806d898
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, since a
 * register of a given width can hold a different number of elements
 * depending on the element width.
 *
 * There are a few invariants to remember in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely one should just avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
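
/*
 * Illustrative sketch (compiled out) of the two invariants above: expanding
 * one register of 8 x unorm16 into two registers of 4 x float32 keeps the
 * register width (16 * 8 == 32 * 4) and the total element count
 * (8 * 1 == 4 * 2) constant. Only lp_type fields already used elsewhere in
 * this file are assumed.
 */
#if 0
static void
example_conversion_invariants(void)
{
   struct lp_type src_type;   /* 8 x unorm16 in one register */
   struct lp_type dst_type;   /* 4 x float32, two registers */
   unsigned num_srcs = 1;
   unsigned num_dsts = 2;

   memset(&src_type, 0, sizeof src_type);
   src_type.norm = TRUE;
   src_type.width = 16;
   src_type.length = 8;

   memset(&dst_type, 0, sizeof dst_type);
   dst_type.floating = TRUE;
   dst_type.sign = TRUE;
   dst_type.width = 32;
   dst_type.length = 4;

   /* register width must remain constant */
   assert(src_type.width * src_type.length ==
          dst_type.width * dst_type.length);

   /* total number of elements must remain constant */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
}
#endif
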
/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no single
 * precision FP to unsigned integer conversion Intel SSE instruction. Second,
 * even if there was, since the FP's mantissa takes only a fraction of
 * register bits, the typical scale and cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width
 * specified by dst_width, the actual result type will have the same width as
 * the source type.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear
       * in the least significant bits of the mantissa, with correct
       * rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). So do a straight
       * multiplication followed by casting. No further rounding is necessary.
       */

      double scale;

      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   }
   else {
      /*
       * The destination width exceeds what can be represented in floating
       * point. So multiply by the largest power of two we can get away with,
       * and then subtract the most significant bit to rescale to normalized
       * values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 rules state INT_MIN should be returned by FPToSI on
       * overflow, which is the correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally or
       * more important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      } else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildAShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB, shifted down to the LSB position, thereby
       * re-scaling from (1 << dst_width) to ((1 << dst_width) - 1).
       */

      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}
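
/*
 * Scalar sketch (compiled out) of the magic-coefficient path above, for
 * dst_width = 8 and a 23-bit float mantissa. The helper name and the use of
 * memcpy as a stand-in for the bitcast are illustrative assumptions, not
 * part of this file's API.
 */
#if 0
static unsigned
float_to_unorm8_sketch(float x)   /* x assumed already clamped to [0, 1] */
{
   const float scale = 255.0f/256.0f;    /* mask/ubound */
   const float bias = (float)(1 << 15);  /* 1 << (mantissa - dst_width) */
   float f = x*scale + bias;             /* FAdd rounds the result into the
                                            low mantissa bits */
   uint32_t bits;
   memcpy(&bits, &f, sizeof bits);       /* bitcast float -> i32 */
   return bits & 0xff;                   /* mask = (1 << dst_width) - 1,
                                            yields round(x * 255) */
}
#endif
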
/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating point
       * (i.e., mantissa + 1 bits). So do a straight multiplication followed
       * by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}
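
/*
 * Scalar sketch (compiled out) of the truncating path above, for
 * src_width = 32 and a 23-bit float mantissa (so n = 23 and bias = 1.0).
 * The helper name and the memcpy bitcasts are illustrative assumptions.
 */
#if 0
static float
unorm32_to_float_sketch(uint32_t u)
{
   const float scale = (float)((double)(1ULL << 23)/((1ULL << 23) - 1));
   const float bias = 1.0f;            /* 1 << (mantissa - n) == 2^0 */
   uint32_t bits;
   float f;

   u >>= 32 - 23;                      /* truncate to what the mantissa holds */
   memcpy(&bits, &bias, sizeof bits);  /* bitcast 1.0f -> 0x3f800000 */
   bits |= u;                          /* plant u in the mantissa: f in [1, 2) */
   memcpy(&f, &bits, sizeof f);
   return (f - bias)*scale;            /* rescale [0, mask/ubound] to [0, 1] */
}
#endif
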
/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;


   /* Special case 4x4f --> 1x16ub
    */
   if (src_type.floating == 1 &&
       src_type.fixed == 0 &&
       src_type.sign == 1 &&
       src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.length == 4 &&

       dst_type.floating == 0 &&
       dst_type.fixed == 0 &&
       dst_type.sign == 0 &&
       dst_type.norm == 1 &&
       dst_type.width == 8 &&
       dst_type.length == 16 &&

       util_cpu_caps.has_sse2)
   {
      int i;

      for (i = 0; i < num_dsts; i++, src += 4) {
         struct lp_type int16_type = dst_type;
         struct lp_type int32_type = dst_type;
         LLVMValueRef lo, hi;
         LLVMValueRef src_int0;
         LLVMValueRef src_int1;
         LLVMValueRef src_int2;
         LLVMValueRef src_int3;
         LLVMTypeRef int16_vec_type;
         LLVMTypeRef int32_vec_type;
         LLVMTypeRef src_vec_type;
         LLVMTypeRef dst_vec_type;
         LLVMValueRef const_255f;
         LLVMValueRef a, b, c, d;

         int16_type.width *= 2;
         int16_type.length /= 2;
         int16_type.sign = 1;

         int32_type.width *= 4;
         int32_type.length /= 4;
         int32_type.sign = 1;

         src_vec_type = lp_build_vec_type(gallivm, src_type);
         dst_vec_type = lp_build_vec_type(gallivm, dst_type);
         int16_vec_type = lp_build_vec_type(gallivm, int16_type);
         int32_vec_type = lp_build_vec_type(gallivm, int32_type);

         const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

         a = LLVMBuildFMul(builder, src[0], const_255f, "");
         b = LLVMBuildFMul(builder, src[1], const_255f, "");
         c = LLVMBuildFMul(builder, src[2], const_255f, "");
         d = LLVMBuildFMul(builder, src[3], const_255f, "");

         {
            struct lp_build_context bld;

            bld.gallivm = gallivm;
            bld.type = src_type;
            bld.vec_type = src_vec_type;
            bld.int_elem_type = lp_build_elem_type(gallivm, int32_type);
            bld.int_vec_type = int32_vec_type;
            bld.undef = lp_build_undef(gallivm, src_type);
            bld.zero = lp_build_zero(gallivm, src_type);
            bld.one = lp_build_one(gallivm, src_type);

            src_int0 = lp_build_iround(&bld, a);
            src_int1 = lp_build_iround(&bld, b);
            src_int2 = lp_build_iround(&bld, c);
            src_int3 = lp_build_iround(&bld, d);
         }
         /* relying on the clamping behavior of the SSE2 pack intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, src_int0, src_int1);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, src_int2, src_int3);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }
      return;
   }
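
   /*
    * Illustrative dataflow for the special case just above (one destination
    * register; the mapping of lp_build_pack2 onto the saturating PACKSSDW /
    * PACKUSWB instructions is an assumption about lp_bld_pack's SSE2
    * lowering, not something this file spells out):
    *
    *   src_int0..3: 4 x i32 each, rounded from values in [0, 255]
    *   lo = pack2(src_int0, src_int1)  --> 8 x i16, signed saturation
    *   hi = pack2(src_int2, src_int3)  --> 8 x i16, signed saturation
    *   dst[i] = pack2(lo, hi)          --> 16 x i8, unsigned saturation
    */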

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);
         LLVMTypeRef tmp_vec_type;

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = FALSE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);

      /* FIXME: compensate different offsets too */
      if(src_shift > dst_shift) {
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     src_shift - dst_shift);
         for(i = 0; i < num_tmps; ++i)
            if(src_type.sign)
               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign = dst_type.sign;
      new_type.width = dst_type.width;
      new_type.length = dst_type.length;

      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized float for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);

      /* FIXME: compensate different offsets too */
      if(src_shift < dst_shift) {
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}
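
/*
 * Usage sketch (compiled out): convert four registers of 4 x float32 in
 * [0, 1] into one register of 16 x unorm8, the layout handled by the special
 * case in lp_build_conv above. The function and variable names are
 * illustrative; src[0..3] are assumed to be previously built 4 x float32
 * LLVM values.
 */
#if 0
static void
example_conv_usage(struct gallivm_state *gallivm, LLVMValueRef src[4])
{
   struct lp_type src_type;   /* 4 x float32 per register */
   struct lp_type dst_type;   /* 16 x unorm8 per register */
   LLVMValueRef dst[1];

   memset(&src_type, 0, sizeof src_type);
   src_type.floating = TRUE;
   src_type.sign = TRUE;
   src_type.width = 32;
   src_type.length = 4;

   memset(&dst_type, 0, sizeof dst_type);
   dst_type.norm = TRUE;
   dst_type.width = 8;
   dst_type.length = 16;

   /* 4 srcs * 4 elements == 1 dst * 16 elements */
   lp_build_conv(gallivm, src_type, dst_type, src, 4, dst, 1);
}
#endif
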
/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or
 * one. Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{
   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * Drop the value semantics of the types.
    *
    * We assume all values are 0 or -1, so no value conversion is needed --
    * only the bit widths matter.
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   if(src_type.width > dst_type.width) {
      assert(num_dsts == 1);
      dst[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
   }
   else if(src_type.width < dst_type.width) {
      assert(num_srcs == 1);
      lp_build_unpack(gallivm, src_type, dst_type, src[0], dst, num_dsts);
   }
   else {
      assert(num_srcs == num_dsts);
      memcpy(dst, src, num_dsts * sizeof *dst);
   }
}
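
/*
 * Usage sketch (compiled out): narrow four registers of 4 x i32 masks into
 * one register of 16 x i8 masks via lp_build_conv_mask above. The function
 * and variable names are illustrative; the mask values are assumed to be
 * all-zeros or all-ones per element, as required above.
 */
#if 0
static void
example_conv_mask_usage(struct gallivm_state *gallivm, LLVMValueRef mask[4])
{
   struct lp_type src_type;   /* 4 x i32 per register */
   struct lp_type dst_type;   /* 16 x i8 per register */
   LLVMValueRef dst[1];

   memset(&src_type, 0, sizeof src_type);
   src_type.sign = TRUE;
   src_type.width = 32;
   src_type.length = 4;

   memset(&dst_type, 0, sizeof dst_type);
   dst_type.sign = TRUE;
   dst_type.width = 8;
   dst_type.length = 16;

   /* src width > dst width, so this takes the lp_build_pack path */
   lp_build_conv_mask(gallivm, src_type, dst_type, mask, 4, dst, 1);
}
#endif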