/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, so keep
 * in mind the following invariants of type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely one should just avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"

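/*
 * Worked example of the two invariants described in the file comment above.
 * This is purely illustrative (a sketch, not used by any code in this file);
 * it mirrors the 4x4f --> 1x16ub fast path in lp_build_conv() below, where
 * four 4 x float32 vectors become one 16 x unorm8 vector.
 */
#if 0
static void
lp_conv_invariants_example(void)
{
   const unsigned src_width = 32, src_length = 4,  num_srcs = 4;
   const unsigned dst_width = 8,  dst_length = 16, num_dsts = 1;

   /* register width must remain constant: 32 * 4 == 8 * 16 == 128 bits */
   assert(src_width * src_length == dst_width * dst_length);

   /* total number of elements must remain constant: 4 * 4 == 16 * 1 == 16 */
   assert(src_length * num_srcs == dst_length * num_dsts);
}
#endif
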
/**
 * Converts int16 half-float to float32.
 *
 * Note this can be performed in a single instruction if vcvtph2ps is
 * available (F16C extension) [llvm.x86.vcvtph2ps / _mm_cvtph_ps].
 *
 * @param src_type <vector> type of int16
 * @param src      value to convert
 *
 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       LLVMValueRef src)
{
   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length);

   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);

   /* Constants */
   LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
   LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
   LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
   LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
   LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
   LLVMValueRef f32_magic = LLVMBuildBitCast(builder,
                                             lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
                                             float_vec_type, "");

   /* Convert int16 vector to int32 vector by zero ext */
   LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");

   /* Exponent / mantissa bits */
   LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
   LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");

   /* Exponent adjust */
   LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");

   /* Make sure Inf/NaN survive */
   LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
   LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");

   /* Sign bit */
   LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
   LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, "");

   /* Combine result */
   LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
   LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, "");

   /* Cast from int32 vector to float32 vector */
   return LLVMBuildBitCast(builder, final, float_vec_type, "");
}

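/*
 * Scalar sketch of the bit manipulation performed by lp_build_half_to_float()
 * above, to make the magic constants easier to follow. Purely illustrative
 * and not used anywhere; the helper name is an assumption.
 */
#if 0
static float
half_to_float_scalar_sketch(unsigned short h)
{
   union { unsigned u; float f; } magic = { (254 - 15) << 23 };
   union { unsigned u; float f; } o;
   unsigned expmant = h & 0x7fff;

   o.u = expmant << 13;                   /* exponent/mantissa bits */
   o.f *= magic.f;                        /* exponent adjust: multiply by 2^(127-15) */
   if (expmant > 0x7bff)                  /* was Inf/NaN? */
      o.u |= 0xff << 23;                  /* keep the exponent all ones */
   o.u |= (unsigned)(h & 0x8000) << 16;   /* sign bit */
   return o.f;
}
#endif
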
/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no
 * single-precision FP to unsigned integer conversion instruction in Intel
 * SSE. Second, even if there were, since the FP's mantissa takes only a
 * fraction of the register bits, the typical scale-and-cast approach would
 * require double precision for accurate results, and therefore half the
 * throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width as the source
 * type.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear in
       * the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). So do a straight
       * multiplication followed by casting. No further rounding is necessary.
       */

      double scale;

      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   }
   else {
      /*
       * The destination width exceeds what can be represented in floating
       * point. So multiply by the largest power of two we can get away with,
       * and then subtract the most significant bit to rescale to normalized
       * values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use signed values.
       * In theory it should be (1 << (src_type.width - 2)), but IEEE 754
       * rules state INT_MIN should be returned in FPToSI, which is the
       * correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally or
       * more important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      } else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB (shifted down to the LSB position), thereby
       * rescaling from (1 << dst_width) to ((1 << dst_width) - 1).
       */

      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}

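/*
 * Scalar sketch of the magic-bias path above (dst_width <= mantissa), for
 * dst_width = 8 and float32 (mantissa = 23). Purely illustrative and not
 * used anywhere; the helper name is an assumption. Adding the bias
 * 1 << (23 - 8) = 32768.0 forces the rounded result into the lowest 8
 * mantissa bits, which the final mask extracts.
 */
#if 0
static unsigned
clamped_float_to_unorm8_sketch(float x)   /* x assumed already clamped to [0, 1] */
{
   union { float f; unsigned u; } tmp;

   tmp.f = x * (255.0f/256.0f) + 32768.0f;   /* scale = mask/ubound, then bias */
   return tmp.u & 0xff;                      /* ~ round(x * 255) */
}
#endif
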
/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 *
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating point
       * (i.e., mantissa + 1 bits). So do a straight multiplication followed
       * by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}

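/*
 * Scalar sketch of the truncating path above (src_width > mantissa + 1),
 * for src_width = 32 and a float32 destination (mantissa = 23, n = 23).
 * Purely illustrative and not used anywhere; the helper name is an
 * assumption. ORing the truncated value into the mantissa of 1.0 and then
 * subtracting the bias avoids an expensive unsigned int-to-float conversion.
 */
#if 0
static float
unorm32_to_float_sketch(unsigned x)
{
   union { unsigned u; float f; } tmp;
   const float scale = (float)(8388608.0 / 8388607.0);   /* 2^23 / (2^23 - 1) */

   tmp.u = (x >> 9) | 0x3f800000;   /* bias = 1.0, so 1.0 <= tmp.f < 2.0 */
   return (tmp.f - 1.0f) * scale;
}
#endif
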
/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision. */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;


   /* Special case 4x4f --> 1x16ub
    */
   if (src_type.floating == 1 &&
       src_type.fixed == 0 &&
       src_type.sign == 1 &&
       src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.length == 4 &&

       dst_type.floating == 0 &&
       dst_type.fixed == 0 &&
       dst_type.sign == 0 &&
       dst_type.norm == 1 &&
       dst_type.width == 8 &&
       dst_type.length == 16 &&

       4 * num_dsts == num_srcs &&

       util_cpu_caps.has_sse2)
   {
      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         for (j = 0; j < 4; ++j) {
            tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
            tmp[j] = lp_build_iround(&bld, tmp[j]);
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }

      return;
   }

   /* Special case 2x8f --> 1x16ub
    */
   else if (src_type.floating == 1 &&
            src_type.fixed == 0 &&
            src_type.sign == 1 &&
            src_type.norm == 0 &&
            src_type.width == 32 &&
            src_type.length == 8 &&

            dst_type.floating == 0 &&
            dst_type.fixed == 0 &&
            dst_type.sign == 0 &&
            dst_type.norm == 1 &&
            dst_type.width == 8 &&
            dst_type.length == 16 &&

            2 * num_dsts == num_srcs &&

            util_cpu_caps.has_avx) {

      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 2) {
         LLVMValueRef lo, hi, a, b;

         a = LLVMBuildFMul(builder, src[0], const_255f, "");
         b = LLVMBuildFMul(builder, src[1], const_255f, "");

         a = lp_build_iround(&bld, a);
         b = lp_build_iround(&bld, b);

         tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
         tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
         tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
         tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }
      return;
   }

   /* Pre convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);
         LLVMTypeRef tmp_vec_type;

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = FALSE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      /* Compensate for different offsets */
      if (dst_offset > src_offset && src_type.width > dst_type.width) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
            if(src_type.sign)
               shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               shifted = LLVMBuildLShr(builder, tmp[i], shift, "");

            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     src_shift - dst_shift);
         for(i = 0; i < num_tmps; ++i)
            if(src_type.sign)
               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign = dst_type.sign;
      new_type.width = dst_type.width;
      new_type.length = dst_type.length;

      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);

         for (i = 0; i < num_tmps; ++i) {
            pre_shift[i] = tmp[i];
            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}

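/*
 * Usage sketch for lp_build_conv(): converting four 4 x float32 vectors into
 * a single 16 x unorm8 vector (the 4x4f --> 1x16ub case, which hits the SSE2
 * fast path above when available). Purely illustrative and not used anywhere;
 * the wrapper name is an assumption.
 */
#if 0
static LLVMValueRef
convert_4x4f_to_16ub_sketch(struct gallivm_state *gallivm,
                            LLVMValueRef src[4])   /* 4 x <4 x float> in [0,1] */
{
   struct lp_type src_type = lp_type_float_vec(32, 128);   /* 4 x float32 */
   struct lp_type dst_type = lp_type_int_vec(8, 128);      /* 16 x int8... */
   LLVMValueRef dst;

   dst_type.sign = FALSE;   /* ...adjusted to describe 16 x unorm8 */
   dst_type.norm = TRUE;

   lp_build_conv(gallivm, src_type, dst_type, src, 4, &dst, 1);

   return dst;
}
#endif
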
/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or
 * one. Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{

   /* We must not lose or gain channels. Only precision. */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * Drop the type qualifiers.
    *
    * We assume all values are 0 or -1, so only the bit pattern matters.
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}