/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, since a
 * single register cannot hold the same number of elements before and after a
 * width change.
 *
 * There are a few invariants to remember in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, the best option is most likely to avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run lp_test_conv unit test after any change to this file.
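 *
 * As a concrete example (one of the optimized cases handled below),
 * converting 4 srcs of 4 x float32 into 1 dst of 16 x unorm8 keeps the
 * register width at 32 * 4 == 8 * 16 == 128 bits and the total element
 * count at 4 * 4 == 16 * 1 == 16.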
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_half.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"
#include "lp_bld_intr.h"
#include "lp_bld_printf.h"
#include "lp_bld_format.h"


/**
 * Converts int16 half-float to float32
 * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
 *
 * @param src   value to convert
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
                            LLVMGetVectorSize(src_type) : 1;

   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMValueRef h;

   if (util_cpu_caps.has_f16c &&
       (src_length == 4 || src_length == 8)) {
      const char *intrinsic = NULL;
      if (src_length == 4) {
         src = lp_build_pad_vector(gallivm, src, 8);
         intrinsic = "llvm.x86.vcvtph2ps.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtph2ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic,
                                      lp_build_vec_type(gallivm, f32_type), src);
   }

   /* Convert int16 vector to int32 vector by zero ext (might generate bad code) */
   h = LLVMBuildZExt(builder, src, int_vec_type, "");
   return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
}


/**
 * Converts float32 to int16 half-float
 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
 *
 * @param src   value to convert
 *
 * Converts float32 to half floats, preserving Infs and NaNs,
 * with rounding towards zero (trunc).
 * XXX: For GL, rounding towards nearest(-even) would be preferable.
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
                   ? LLVMGetVectorSize(f32_vec_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMValueRef result;

   /*
    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
    * directly, without any (x86 or generic) intrinsics.
    * However, the rounding mode cannot be specified (and is undefined;
    * in practice on x86 it seems to do nearest-even, but that may depend
    * on instruction set support), so it is essentially useless here.
    */

   if (util_cpu_caps.has_f16c &&
       (length == 4 || length == 8)) {
      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      const char *intrinsic = NULL;
      if (length == 4) {
         intrinsic = "llvm.x86.vcvtps2ph.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtps2ph.256";
      }
      result = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, i168_type),
                                         src, LLVMConstInt(i32t, mode, 0));
      if (length == 4) {
         result = lp_build_extract_range(gallivm, result, 0, 4);
      }
   }

   else {
      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
      /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
   }

   /*
    * Debugging code.
    */
   if (0) {
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
      unsigned i;

      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");

      for (i = 0; i < length; ++i) {
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
         /*
          * XXX: not really supported by backends.
          * Even if it were, the rounding mode cannot be specified and
          * is undefined.
          */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
#endif
         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
      }

      lp_build_print_value(gallivm, "src = ", src);
      lp_build_print_value(gallivm, "llvm = ", result);
      lp_build_print_value(gallivm, "util = ", ref_result);
      lp_build_printf(gallivm, "\n");
   }

   return result;
}


/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no single
 * precision FP to unsigned integer conversion Intel SSE instruction. Second,
 * even if there were, since the FP's mantissa takes only a fraction of the
 * register bits, the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width as the source.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
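 *
 * As a concrete illustration, for dst_width = 8 a src of
 * { 0.0, 0.2, 0.6, 1.0 } yields { 0, 51, 153, 255 }.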
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear in
       * the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      /* instead of fadd/and could (with sse2) just use lp_build_iround */
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). Even so, correct rounding
       * still needs to be applied (only for numbers in [0.5, 1.0] would
       * conversion using truncation after scaling be sufficient).
       */
      double scale;
      struct lp_build_context uf32_bld;

      lp_build_context_init(&uf32_bld, gallivm, src_type);
      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = lp_build_iround(&uf32_bld, res);
   }
   else {
      /*
       * The destination exceeds what can be represented in floating point.
       * So multiply by the largest power of two we can get away with, and
       * then subtract the most significant bit to rescale to normalized
       * values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 rules state INT_MIN should be returned by FPToSI for
       * out-of-range values, which happens to be the correct result for
       * values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally or
       * more important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1u, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
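       *
       * E.g. (illustrative case) for dst_width == 32: n == 31, lshift == 1,
       * rshift == 31. A src of 1.0 gives res == 0x80000000 (INT_MIN from the
       * out-of-range FPToSI), hence lshifted == 0 and rshifted == 1, and the
       * final subtraction below yields 0xffffffff as desired.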
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      } else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB (shifted down to the LSB position), thereby
       * re-scaling from (1 << dst_width) to ((1 << dst_width) - 1).
       */

      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}


/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating point
       * (i.e., mantissa + 1 bits). So do a straight cast followed by a
       * multiplication. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}


/**
 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
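 *
 * For example, feeding 4 srcs of 4 x float32 with an 8-bit normalized dst
 * type will (on SSE2/AltiVec) widen *dst_type to a 16-element vector and
 * produce a single dst, so the optimized 4x4x32 --> 1x16x8 path in
 * lp_build_conv can be taken.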
 *
 * Returns the number of dsts created from src
 */
int lp_build_conv_auto(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       struct lp_type* dst_type,
                       const LLVMValueRef *src,
                       unsigned num_srcs,
                       LLVMValueRef *dst)
{
   unsigned i;
   int num_dsts = num_srcs;

   if (src_type.floating == dst_type->floating &&
       src_type.width == dst_type->width &&
       src_type.length == dst_type->length &&
       src_type.fixed == dst_type->fixed &&
       src_type.norm == dst_type->norm &&
       src_type.sign == dst_type->sign)
      return num_dsts;

   /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
    */
   if (src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.fixed == 0 &&

       dst_type->floating == 0 &&
       dst_type->fixed == 0 &&
       dst_type->width == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) ||
        (src_type.floating == 0 && dst_type->floating == 0 &&
         src_type.sign == dst_type->sign && dst_type->norm == 0))) {

      /* Special case 4x4x32 --> 1x16x8 */
      if (src_type.length == 4 &&
          (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
      {
         num_dsts = (num_srcs + 3) / 4;
         dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }

      /* Special case 2x8x32 --> 1x16x8 */
      if (src_type.length == 8 &&
          util_cpu_caps.has_avx)
      {
         num_dsts = (num_srcs + 1) / 2;
         dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }
   }

   /* lp_build_resize does not support M:N */
   if (src_type.width == dst_type->width) {
      lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
   } else {
      /*
       * If dst_width is 16 bits and src_width 32 and the dst vector size is
       * 64bit, try feeding 2 vectors at once so pack intrinsics can be used.
       * (For AVX, this isn't needed, since we usually get 256bit src and
       * 128bit dst vectors which works ok. If we do AVX2 pack this should
       * be extended, but we need to be able to tell the conversion code
       * about pack ordering first.)
       */
      unsigned ratio = 1;
      if (src_type.width == 2 * dst_type->width &&
          src_type.length == dst_type->length &&
          dst_type->floating == 0 && (num_srcs % 2 == 0) &&
          dst_type->width * dst_type->length == 64) {
         ratio = 2;
         num_dsts /= 2;
         dst_type->length *= 2;
      }
      for (i = 0; i < num_dsts; i++) {
         lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1);
      }
   }

   return num_dsts;
}


/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision. */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;


   /*
    * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
    * Only float -> s/unorm8 and (u)int32->(u)int8.
    * XXX: This should cover all interesting backend cases for 8 bit,
    * but the same strategy should be used if dst is 16 bit.
    */
   if (src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.length == 4 &&
       src_type.fixed == 0 &&

       dst_type.floating == 0 &&
       dst_type.fixed == 0 &&
       dst_type.width == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
        (src_type.floating == 0 && dst_type.floating == 0 &&
         src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

       ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
        (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&

       (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
   {
      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         if (src_type.floating) {
            for (j = 0; j < dst_type.length / 4; ++j) {
               /*
                * XXX This is not actually fully correct. The float to int
                * conversion will produce the value 0x80000000 for everything
                * out of range and for NaNs (on x86, llvm.x86.sse2.cvtps2dq).
                * Hence, NaNs and negatives will get clamped to zero just fine
                * (relying on the clamping pack behavior) when converting to
                * unorm; however, too large values (both finite and infinite)
                * will also end up as zero, not 255.
                * For snorm, for now we'll keep bug compatibility with the
                * generic conversion path (meaning too large values are fine,
                * but NaNs get converted to -128 (purely by luck, as we don't
                * specify NaN behavior for the max there) instead of 0).
                */
               if (dst_type.sign) {
                  tmp[j] = lp_build_min(&bld, bld.one, src[j]);
               }
               else {
                  if (0) {
                     tmp[j] = lp_build_min_ext(&bld, bld.one, src[j],
                                               GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
                  }
                  tmp[j] = src[j];
               }
               tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, "");
               tmp[j] = lp_build_iround(&bld, tmp[j]);
            }
         } else {
            for (j = 0; j < dst_type.length / 4; ++j) {
               if (!dst_type.sign) {
                  /*
                   * Pack clamp is always signed->unsigned (or signed->signed).
                   * Hence need min.
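                   * (E.g. an uint32 src value of 0x80000000 would be seen as
                   * negative by the signed pack and end up as 0 instead of 255
                   * without this pre-clamp.)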
                   */
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  tmp[j] = lp_build_min(&bld, src[j], const_max);
               } else {
                  tmp[j] = src[j];
               }
            }
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);

         if (num_srcs < 4) {
            hi = lo;
         }
         else {
            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
      }
      if (num_srcs < 4) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }

   /* Special case 2x8x32 --> 1x16x8, 1x8x32 -> 1x8x8
    */
   else if (src_type.norm == 0 &&
            src_type.width == 32 &&
            src_type.length == 8 &&
            src_type.fixed == 0 &&

            dst_type.floating == 0 &&
            dst_type.fixed == 0 &&
            dst_type.width == 8 &&

            ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
             (src_type.floating == 0 && dst_type.floating == 0 &&
              src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

            ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
             (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&

            util_cpu_caps.has_avx) {

      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 2) {
         unsigned j;
         for (j = 0; j < (num_srcs == 1 ? 1 : 2); j++) {
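            /* Clamp/scale one 8-wide src as needed and pack it down to 8 x int16. */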
            LLVMValueRef lo, hi, a;

            a = src[j];
            if (src_type.floating) {
               if (dst_type.sign) {
                  a = lp_build_min(&bld, bld.one, a);
               }
               else {
                  if (0) {
                     a = lp_build_min_ext(&bld, bld.one, a,
                                          GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
                  }
               }
               a = LLVMBuildFMul(builder, a, const_scale, "");
               a = lp_build_iround(&bld, a);
            } else {
               if (!dst_type.sign) {
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  a = lp_build_min(&bld, a, const_max);
               }
            }
            lo = lp_build_extract_range(gallivm, a, 0, 4);
            hi = lp_build_extract_range(gallivm, a, 4, 4);
            /* relying on clamping behavior of sse2 intrinsics here */
            tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi);
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]);
      }

      if (num_srcs == 1) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }

   /* Special case -> 16bit half-float
    */
   else if (dst_type.floating && dst_type.width == 16)
   {
      /* Only support src as 32bit float currently */
      assert(src_type.floating && src_type.width == 32);

      for(i = 0; i < num_tmps; ++i)
         dst[i] = lp_build_float_to_half(gallivm, tmp[i]);

      return;
   }

   /* Pre-convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /*
          * These functions will use fptosi in some form, which won't work
          * with a 32bit uint dst. (Enabling the assert below causes
          * lp_test_conv failures, though.)
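          * E.g. a float value of 3e9 fits in an uint32 dst but is out of
          * range for fptosi to int32.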
          */
         if (0)
            assert(dst_type.sign || dst_type.width < 32);

         if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_iround(&bld, tmp[i]);
            }
            tmp_type.floating = FALSE;
         }
         else {
            LLVMTypeRef tmp_vec_type;

            tmp_type.floating = FALSE;
            tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
#if 0
               if(dst_type.sign)
                  tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
               else
                  tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
               /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      /* Compensate for different offsets */
      /* sscaled -> unorm and similar would cause negative shift count, skip */
      if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;

            shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign = dst_type.sign;
      new_type.width = dst_type.width;
      new_type.length = dst_type.length;

      /*
       * Note that resize when using packs can sometimes get min/max
       * clamping for free. Should be able to exploit this...
       */
      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized float type for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* the formula above will produce values below -1.0 for the most
          * negative value, but everything seems happy with that, hence
          * this is disabled for now */
         if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, dst_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_max(&bld, tmp[i],
                                     lp_build_const_vec(gallivm, dst_type, -1.0f));
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];

         if (dst_shift - src_shift < dst_type.width) {
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
            }
         }
         else {
            /*
             * This happens for things like sscaled -> unorm conversions. Shift
             * counts equal to bit width cause undefined results, so hack around it.
             */
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_zero(gallivm, dst_type);
            }
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}


/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
 * Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
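 *
 * E.g. 4 masks of 4 x int32 (each element 0 or ~0) can be resized into a
 * single mask of 16 x int8; only the bit width changes, the 0 / -1 values
 * are preserved.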
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{

   /* We must not lose or gain channels. Only precision. */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * Drop the floating/fixed/norm qualifiers and treat the masks as plain
    * signed integers.
    *
    * We assume all values are 0 or -1.
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}