lp_bld_arit.c revision 5161aff48af2fe0171be06fc727a000ad300fbd9
1/************************************************************************** 2 * 3 * Copyright 2009-2010 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 29/** 30 * @file 31 * Helper 32 * 33 * LLVM IR doesn't support all basic arithmetic operations we care about (most 34 * notably min/max and saturated operations), and it is often necessary to 35 * resort machine-specific intrinsics directly. The functions here hide all 36 * these implementation details from the other modules. 37 * 38 * We also do simple expressions simplification here. 
Reasons are: 39 * - it is very easy given we have all necessary information readily available 40 * - LLVM optimization passes fail to simplify several vector expressions 41 * - We often know value constraints which the optimization passes have no way 42 * of knowing, such as when source arguments are known to be in [0, 1] range. 43 * 44 * @author Jose Fonseca <jfonseca@vmware.com> 45 */ 46 47 48#include "util/u_memory.h" 49#include "util/u_debug.h" 50#include "util/u_math.h" 51#include "util/u_string.h" 52#include "util/u_cpu_detect.h" 53 54#include "lp_bld_type.h" 55#include "lp_bld_const.h" 56#include "lp_bld_init.h" 57#include "lp_bld_intr.h" 58#include "lp_bld_logic.h" 59#include "lp_bld_pack.h" 60#include "lp_bld_debug.h" 61#include "lp_bld_arit.h" 62 63 64#define EXP_POLY_DEGREE 5 65 66#define LOG_POLY_DEGREE 5 67 68 69/** 70 * Generate min(a, b) 71 * No checks for special case values of a or b = 1 or 0 are done. 72 */ 73static LLVMValueRef 74lp_build_min_simple(struct lp_build_context *bld, 75 LLVMValueRef a, 76 LLVMValueRef b) 77{ 78 LLVMBuilderRef builder = bld->gallivm->builder; 79 const struct lp_type type = bld->type; 80 const char *intrinsic = NULL; 81 LLVMValueRef cond; 82 83 assert(lp_check_value(type, a)); 84 assert(lp_check_value(type, b)); 85 86 /* TODO: optimize the constant case */ 87 88 if(type.width * type.length == 128) { 89 if(type.floating) { 90 if(type.width == 32 && util_cpu_caps.has_sse) 91 intrinsic = "llvm.x86.sse.min.ps"; 92 if(type.width == 64 && util_cpu_caps.has_sse2) 93 intrinsic = "llvm.x86.sse2.min.pd"; 94 } 95 else { 96 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) 97 intrinsic = "llvm.x86.sse2.pminu.b"; 98 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) 99 intrinsic = "llvm.x86.sse41.pminsb"; 100 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) 101 intrinsic = "llvm.x86.sse41.pminuw"; 102 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) 103 intrinsic = 
"llvm.x86.sse2.pmins.w"; 104 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) 105 intrinsic = "llvm.x86.sse41.pminud"; 106 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) 107 intrinsic = "llvm.x86.sse41.pminsd"; 108 } 109 } 110 111 if(intrinsic) 112 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 113 114 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 115 return lp_build_select(bld, cond, a, b); 116} 117 118 119/** 120 * Generate max(a, b) 121 * No checks for special case values of a or b = 1 or 0 are done. 122 */ 123static LLVMValueRef 124lp_build_max_simple(struct lp_build_context *bld, 125 LLVMValueRef a, 126 LLVMValueRef b) 127{ 128 LLVMBuilderRef builder = bld->gallivm->builder; 129 const struct lp_type type = bld->type; 130 const char *intrinsic = NULL; 131 LLVMValueRef cond; 132 133 assert(lp_check_value(type, a)); 134 assert(lp_check_value(type, b)); 135 136 /* TODO: optimize the constant case */ 137 138 if(type.width * type.length == 128) { 139 if(type.floating) { 140 if(type.width == 32 && util_cpu_caps.has_sse) 141 intrinsic = "llvm.x86.sse.max.ps"; 142 if(type.width == 64 && util_cpu_caps.has_sse2) 143 intrinsic = "llvm.x86.sse2.max.pd"; 144 } 145 else { 146 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) 147 intrinsic = "llvm.x86.sse2.pmaxu.b"; 148 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) 149 intrinsic = "llvm.x86.sse41.pmaxsb"; 150 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) 151 intrinsic = "llvm.x86.sse41.pmaxuw"; 152 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) 153 intrinsic = "llvm.x86.sse2.pmaxs.w"; 154 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) 155 intrinsic = "llvm.x86.sse41.pmaxud"; 156 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) 157 intrinsic = "llvm.x86.sse41.pmaxsd"; 158 } 159 } 160 161 if(intrinsic) 162 return lp_build_intrinsic_binary(builder, intrinsic, 
lp_build_vec_type(bld->gallivm, bld->type), a, b); 163 164 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 165 return lp_build_select(bld, cond, a, b); 166} 167 168 169/** 170 * Generate 1 - a, or ~a depending on bld->type. 171 */ 172LLVMValueRef 173lp_build_comp(struct lp_build_context *bld, 174 LLVMValueRef a) 175{ 176 LLVMBuilderRef builder = bld->gallivm->builder; 177 const struct lp_type type = bld->type; 178 179 assert(lp_check_value(type, a)); 180 181 if(a == bld->one) 182 return bld->zero; 183 if(a == bld->zero) 184 return bld->one; 185 186 if(type.norm && !type.floating && !type.fixed && !type.sign) { 187 if(LLVMIsConstant(a)) 188 return LLVMConstNot(a); 189 else 190 return LLVMBuildNot(builder, a, ""); 191 } 192 193 if(LLVMIsConstant(a)) 194 if (type.floating) 195 return LLVMConstFSub(bld->one, a); 196 else 197 return LLVMConstSub(bld->one, a); 198 else 199 if (type.floating) 200 return LLVMBuildFSub(builder, bld->one, a, ""); 201 else 202 return LLVMBuildSub(builder, bld->one, a, ""); 203} 204 205 206/** 207 * Generate a + b 208 */ 209LLVMValueRef 210lp_build_add(struct lp_build_context *bld, 211 LLVMValueRef a, 212 LLVMValueRef b) 213{ 214 LLVMBuilderRef builder = bld->gallivm->builder; 215 const struct lp_type type = bld->type; 216 LLVMValueRef res; 217 218 assert(lp_check_value(type, a)); 219 assert(lp_check_value(type, b)); 220 221 if(a == bld->zero) 222 return b; 223 if(b == bld->zero) 224 return a; 225 if(a == bld->undef || b == bld->undef) 226 return bld->undef; 227 228 if(bld->type.norm) { 229 const char *intrinsic = NULL; 230 231 if(a == bld->one || b == bld->one) 232 return bld->one; 233 234 if(util_cpu_caps.has_sse2 && 235 type.width * type.length == 128 && 236 !type.floating && !type.fixed) { 237 if(type.width == 8) 238 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; 239 if(type.width == 16) 240 intrinsic = type.sign ? 
"llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; 241 } 242 243 if(intrinsic) 244 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 245 } 246 247 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 248 if (type.floating) 249 res = LLVMConstFAdd(a, b); 250 else 251 res = LLVMConstAdd(a, b); 252 else 253 if (type.floating) 254 res = LLVMBuildFAdd(builder, a, b, ""); 255 else 256 res = LLVMBuildAdd(builder, a, b, ""); 257 258 /* clamp to ceiling of 1.0 */ 259 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 260 res = lp_build_min_simple(bld, res, bld->one); 261 262 /* XXX clamp to floor of -1 or 0??? */ 263 264 return res; 265} 266 267 268/** Return the scalar sum of the elements of a */ 269LLVMValueRef 270lp_build_sum_vector(struct lp_build_context *bld, 271 LLVMValueRef a) 272{ 273 LLVMBuilderRef builder = bld->gallivm->builder; 274 const struct lp_type type = bld->type; 275 LLVMValueRef index, res; 276 unsigned i; 277 278 assert(lp_check_value(type, a)); 279 280 if (type.length == 1) { 281 return a; 282 } 283 284 assert(!bld->type.norm); 285 286 index = lp_build_const_int32(bld->gallivm, 0); 287 res = LLVMBuildExtractElement(builder, a, index, ""); 288 289 for (i = 1; i < type.length; i++) { 290 index = lp_build_const_int32(bld->gallivm, i); 291 if (type.floating) 292 res = LLVMBuildFAdd(builder, res, 293 LLVMBuildExtractElement(builder, 294 a, index, ""), 295 ""); 296 else 297 res = LLVMBuildAdd(builder, res, 298 LLVMBuildExtractElement(builder, 299 a, index, ""), 300 ""); 301 } 302 303 return res; 304} 305 306 307/** 308 * Generate a - b 309 */ 310LLVMValueRef 311lp_build_sub(struct lp_build_context *bld, 312 LLVMValueRef a, 313 LLVMValueRef b) 314{ 315 LLVMBuilderRef builder = bld->gallivm->builder; 316 const struct lp_type type = bld->type; 317 LLVMValueRef res; 318 319 assert(lp_check_value(type, a)); 320 assert(lp_check_value(type, b)); 321 322 if(b == bld->zero) 323 return a; 324 if(a == bld->undef 
|| b == bld->undef) 325 return bld->undef; 326 if(a == b) 327 return bld->zero; 328 329 if(bld->type.norm) { 330 const char *intrinsic = NULL; 331 332 if(b == bld->one) 333 return bld->zero; 334 335 if(util_cpu_caps.has_sse2 && 336 type.width * type.length == 128 && 337 !type.floating && !type.fixed) { 338 if(type.width == 8) 339 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; 340 if(type.width == 16) 341 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; 342 } 343 344 if(intrinsic) 345 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 346 } 347 348 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 349 if (type.floating) 350 res = LLVMConstFSub(a, b); 351 else 352 res = LLVMConstSub(a, b); 353 else 354 if (type.floating) 355 res = LLVMBuildFSub(builder, a, b, ""); 356 else 357 res = LLVMBuildSub(builder, a, b, ""); 358 359 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 360 res = lp_build_max_simple(bld, res, bld->zero); 361 362 return res; 363} 364 365 366/** 367 * Normalized 8bit multiplication. 368 * 369 * - alpha plus one 370 * 371 * makes the following approximation to the division (Sree) 372 * 373 * a*b/255 ~= (a*(b + 1)) >> 256 374 * 375 * which is the fastest method that satisfies the following OpenGL criteria 376 * 377 * 0*0 = 0 and 255*255 = 255 378 * 379 * - geometric series 380 * 381 * takes the geometric series approximation to the division 382 * 383 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) .. 
384 * 385 * in this case just the first two terms to fit in 16bit arithmetic 386 * 387 * t/255 ~= (t + (t >> 8)) >> 8 388 * 389 * note that just by itself it doesn't satisfies the OpenGL criteria, as 390 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff 391 * must be used 392 * 393 * - geometric series plus rounding 394 * 395 * when using a geometric series division instead of truncating the result 396 * use roundoff in the approximation (Jim Blinn) 397 * 398 * t/255 ~= (t + (t >> 8) + 0x80) >> 8 399 * 400 * achieving the exact results 401 * 402 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 403 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf 404 * @sa Michael Herf, The "double blend trick", May 2000, 405 * http://www.stereopsis.com/doubleblend.html 406 */ 407static LLVMValueRef 408lp_build_mul_u8n(struct gallivm_state *gallivm, 409 struct lp_type i16_type, 410 LLVMValueRef a, LLVMValueRef b) 411{ 412 LLVMBuilderRef builder = gallivm->builder; 413 LLVMValueRef c8; 414 LLVMValueRef ab; 415 416 assert(!i16_type.floating); 417 assert(lp_check_value(i16_type, a)); 418 assert(lp_check_value(i16_type, b)); 419 420 c8 = lp_build_const_int_vec(gallivm, i16_type, 8); 421 422#if 0 423 424 /* a*b/255 ~= (a*(b + 1)) >> 256 */ 425 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallium, i16_type, 1), ""); 426 ab = LLVMBuildMul(builder, a, b, ""); 427 428#else 429 430 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */ 431 ab = LLVMBuildMul(builder, a, b, ""); 432 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), ""); 433 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), ""); 434 435#endif 436 437 ab = LLVMBuildLShr(builder, ab, c8, ""); 438 439 return ab; 440} 441 442 443/** 444 * Generate a * b 445 */ 446LLVMValueRef 447lp_build_mul(struct lp_build_context *bld, 448 LLVMValueRef a, 449 LLVMValueRef b) 450{ 451 LLVMBuilderRef builder = bld->gallivm->builder; 452 const struct 
lp_type type = bld->type; 453 LLVMValueRef shift; 454 LLVMValueRef res; 455 456 assert(lp_check_value(type, a)); 457 assert(lp_check_value(type, b)); 458 459 if(a == bld->zero) 460 return bld->zero; 461 if(a == bld->one) 462 return b; 463 if(b == bld->zero) 464 return bld->zero; 465 if(b == bld->one) 466 return a; 467 if(a == bld->undef || b == bld->undef) 468 return bld->undef; 469 470 if(!type.floating && !type.fixed && type.norm) { 471 if(type.width == 8) { 472 struct lp_type i16_type = lp_wider_type(type); 473 LLVMValueRef al, ah, bl, bh, abl, abh, ab; 474 475 lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah); 476 lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh); 477 478 /* PMULLW, PSRLW, PADDW */ 479 abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl); 480 abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh); 481 482 ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh); 483 484 return ab; 485 } 486 487 /* FIXME */ 488 assert(0); 489 } 490 491 if(type.fixed) 492 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2); 493 else 494 shift = NULL; 495 496 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 497 if (type.floating) 498 res = LLVMConstFMul(a, b); 499 else 500 res = LLVMConstMul(a, b); 501 if(shift) { 502 if(type.sign) 503 res = LLVMConstAShr(res, shift); 504 else 505 res = LLVMConstLShr(res, shift); 506 } 507 } 508 else { 509 if (type.floating) 510 res = LLVMBuildFMul(builder, a, b, ""); 511 else 512 res = LLVMBuildMul(builder, a, b, ""); 513 if(shift) { 514 if(type.sign) 515 res = LLVMBuildAShr(builder, res, shift, ""); 516 else 517 res = LLVMBuildLShr(builder, res, shift, ""); 518 } 519 } 520 521 return res; 522} 523 524 525/** 526 * Small vector x scale multiplication optimization. 
527 */ 528LLVMValueRef 529lp_build_mul_imm(struct lp_build_context *bld, 530 LLVMValueRef a, 531 int b) 532{ 533 LLVMBuilderRef builder = bld->gallivm->builder; 534 LLVMValueRef factor; 535 536 assert(lp_check_value(bld->type, a)); 537 538 if(b == 0) 539 return bld->zero; 540 541 if(b == 1) 542 return a; 543 544 if(b == -1) 545 return lp_build_negate(bld, a); 546 547 if(b == 2 && bld->type.floating) 548 return lp_build_add(bld, a, a); 549 550 if(util_is_power_of_two(b)) { 551 unsigned shift = ffs(b) - 1; 552 553 if(bld->type.floating) { 554#if 0 555 /* 556 * Power of two multiplication by directly manipulating the mantissa. 557 * 558 * XXX: This might not be always faster, it will introduce a small error 559 * for multiplication by zero, and it will produce wrong results 560 * for Inf and NaN. 561 */ 562 unsigned mantissa = lp_mantissa(bld->type); 563 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa); 564 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), ""); 565 a = LLVMBuildAdd(builder, a, factor, ""); 566 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), ""); 567 return a; 568#endif 569 } 570 else { 571 factor = lp_build_const_vec(bld->gallivm, bld->type, shift); 572 return LLVMBuildShl(builder, a, factor, ""); 573 } 574 } 575 576 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b); 577 return lp_build_mul(bld, a, factor); 578} 579 580 581/** 582 * Generate a / b 583 */ 584LLVMValueRef 585lp_build_div(struct lp_build_context *bld, 586 LLVMValueRef a, 587 LLVMValueRef b) 588{ 589 LLVMBuilderRef builder = bld->gallivm->builder; 590 const struct lp_type type = bld->type; 591 592 assert(lp_check_value(type, a)); 593 assert(lp_check_value(type, b)); 594 595 if(a == bld->zero) 596 return bld->zero; 597 if(a == bld->one) 598 return lp_build_rcp(bld, b); 599 if(b == bld->zero) 600 return bld->undef; 601 if(b == bld->one) 602 return a; 603 if(a == bld->undef || b == 
bld->undef) 604 return bld->undef; 605 606 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 607 if (type.floating) 608 return LLVMConstFDiv(a, b); 609 else if (type.sign) 610 return LLVMConstSDiv(a, b); 611 else 612 return LLVMConstUDiv(a, b); 613 } 614 615 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) 616 return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 617 618 if (type.floating) 619 return LLVMBuildFDiv(builder, a, b, ""); 620 else if (type.sign) 621 return LLVMBuildSDiv(builder, a, b, ""); 622 else 623 return LLVMBuildUDiv(builder, a, b, ""); 624} 625 626 627/** 628 * Linear interpolation -- without any checks. 629 * 630 * @sa http://www.stereopsis.com/doubleblend.html 631 */ 632static INLINE LLVMValueRef 633lp_build_lerp_simple(struct lp_build_context *bld, 634 LLVMValueRef x, 635 LLVMValueRef v0, 636 LLVMValueRef v1) 637{ 638 LLVMBuilderRef builder = bld->gallivm->builder; 639 LLVMValueRef delta; 640 LLVMValueRef res; 641 642 assert(lp_check_value(bld->type, x)); 643 assert(lp_check_value(bld->type, v0)); 644 assert(lp_check_value(bld->type, v1)); 645 646 delta = lp_build_sub(bld, v1, v0); 647 648 res = lp_build_mul(bld, x, delta); 649 650 res = lp_build_add(bld, v0, res); 651 652 if (bld->type.fixed) { 653 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits, 654 * but it will be wrong for other uses. Basically we need a more 655 * powerful lp_type, capable of further distinguishing the values 656 * interpretation from the value storage. */ 657 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), ""); 658 } 659 660 return res; 661} 662 663 664/** 665 * Linear interpolation. 
666 */ 667LLVMValueRef 668lp_build_lerp(struct lp_build_context *bld, 669 LLVMValueRef x, 670 LLVMValueRef v0, 671 LLVMValueRef v1) 672{ 673 LLVMBuilderRef builder = bld->gallivm->builder; 674 const struct lp_type type = bld->type; 675 LLVMValueRef res; 676 677 assert(lp_check_value(type, x)); 678 assert(lp_check_value(type, v0)); 679 assert(lp_check_value(type, v1)); 680 681 if (type.norm) { 682 struct lp_type wide_type; 683 struct lp_build_context wide_bld; 684 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; 685 LLVMValueRef shift; 686 687 assert(type.length >= 2); 688 assert(!type.sign); 689 690 /* 691 * Create a wider type, enough to hold the intermediate result of the 692 * multiplication. 693 */ 694 memset(&wide_type, 0, sizeof wide_type); 695 wide_type.fixed = TRUE; 696 wide_type.width = type.width*2; 697 wide_type.length = type.length/2; 698 699 lp_build_context_init(&wide_bld, bld->gallivm, wide_type); 700 701 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh); 702 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h); 703 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h); 704 705 /* 706 * Scale x from [0, 255] to [0, 256] 707 */ 708 709 shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1); 710 711 xl = lp_build_add(&wide_bld, xl, 712 LLVMBuildAShr(builder, xl, shift, "")); 713 xh = lp_build_add(&wide_bld, xh, 714 LLVMBuildAShr(builder, xh, shift, "")); 715 716 /* 717 * Lerp both halves. 
718 */ 719 720 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l); 721 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h); 722 723 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh); 724 } else { 725 res = lp_build_lerp_simple(bld, x, v0, v1); 726 } 727 728 return res; 729} 730 731 732LLVMValueRef 733lp_build_lerp_2d(struct lp_build_context *bld, 734 LLVMValueRef x, 735 LLVMValueRef y, 736 LLVMValueRef v00, 737 LLVMValueRef v01, 738 LLVMValueRef v10, 739 LLVMValueRef v11) 740{ 741 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01); 742 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11); 743 return lp_build_lerp(bld, y, v0, v1); 744} 745 746 747/** 748 * Generate min(a, b) 749 * Do checks for special cases. 750 */ 751LLVMValueRef 752lp_build_min(struct lp_build_context *bld, 753 LLVMValueRef a, 754 LLVMValueRef b) 755{ 756 assert(lp_check_value(bld->type, a)); 757 assert(lp_check_value(bld->type, b)); 758 759 if(a == bld->undef || b == bld->undef) 760 return bld->undef; 761 762 if(a == b) 763 return a; 764 765 if(bld->type.norm) { 766 if(a == bld->zero || b == bld->zero) 767 return bld->zero; 768 if(a == bld->one) 769 return b; 770 if(b == bld->one) 771 return a; 772 } 773 774 return lp_build_min_simple(bld, a, b); 775} 776 777 778/** 779 * Generate max(a, b) 780 * Do checks for special cases. 781 */ 782LLVMValueRef 783lp_build_max(struct lp_build_context *bld, 784 LLVMValueRef a, 785 LLVMValueRef b) 786{ 787 assert(lp_check_value(bld->type, a)); 788 assert(lp_check_value(bld->type, b)); 789 790 if(a == bld->undef || b == bld->undef) 791 return bld->undef; 792 793 if(a == b) 794 return a; 795 796 if(bld->type.norm) { 797 if(a == bld->one || b == bld->one) 798 return bld->one; 799 if(a == bld->zero) 800 return b; 801 if(b == bld->zero) 802 return a; 803 } 804 805 return lp_build_max_simple(bld, a, b); 806} 807 808 809/** 810 * Generate clamp(a, min, max) 811 * Do checks for special cases. 
812 */ 813LLVMValueRef 814lp_build_clamp(struct lp_build_context *bld, 815 LLVMValueRef a, 816 LLVMValueRef min, 817 LLVMValueRef max) 818{ 819 assert(lp_check_value(bld->type, a)); 820 assert(lp_check_value(bld->type, min)); 821 assert(lp_check_value(bld->type, max)); 822 823 a = lp_build_min(bld, a, max); 824 a = lp_build_max(bld, a, min); 825 return a; 826} 827 828 829/** 830 * Generate abs(a) 831 */ 832LLVMValueRef 833lp_build_abs(struct lp_build_context *bld, 834 LLVMValueRef a) 835{ 836 LLVMBuilderRef builder = bld->gallivm->builder; 837 const struct lp_type type = bld->type; 838 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 839 840 assert(lp_check_value(type, a)); 841 842 if(!type.sign) 843 return a; 844 845 if(type.floating) { 846 /* Mask out the sign bit */ 847 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 848 unsigned long long absMask = ~(1ULL << (type.width - 1)); 849 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask)); 850 a = LLVMBuildBitCast(builder, a, int_vec_type, ""); 851 a = LLVMBuildAnd(builder, a, mask, ""); 852 a = LLVMBuildBitCast(builder, a, vec_type, ""); 853 return a; 854 } 855 856 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) { 857 switch(type.width) { 858 case 8: 859 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 860 case 16: 861 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 862 case 32: 863 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 864 } 865 } 866 867 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, "")); 868} 869 870 871LLVMValueRef 872lp_build_negate(struct lp_build_context *bld, 873 LLVMValueRef a) 874{ 875 LLVMBuilderRef builder = bld->gallivm->builder; 876 877 assert(lp_check_value(bld->type, a)); 878 879#if HAVE_LLVM >= 0x0207 880 if (bld->type.floating) 881 a = LLVMBuildFNeg(builder, a, ""); 882 else 883#endif 884 
a = LLVMBuildNeg(builder, a, ""); 885 886 return a; 887} 888 889 890/** Return -1, 0 or +1 depending on the sign of a */ 891LLVMValueRef 892lp_build_sgn(struct lp_build_context *bld, 893 LLVMValueRef a) 894{ 895 LLVMBuilderRef builder = bld->gallivm->builder; 896 const struct lp_type type = bld->type; 897 LLVMValueRef cond; 898 LLVMValueRef res; 899 900 assert(lp_check_value(type, a)); 901 902 /* Handle non-zero case */ 903 if(!type.sign) { 904 /* if not zero then sign must be positive */ 905 res = bld->one; 906 } 907 else if(type.floating) { 908 LLVMTypeRef vec_type; 909 LLVMTypeRef int_type; 910 LLVMValueRef mask; 911 LLVMValueRef sign; 912 LLVMValueRef one; 913 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); 914 915 int_type = lp_build_int_vec_type(bld->gallivm, type); 916 vec_type = lp_build_vec_type(bld->gallivm, type); 917 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit); 918 919 /* Take the sign bit and add it to 1 constant */ 920 sign = LLVMBuildBitCast(builder, a, int_type, ""); 921 sign = LLVMBuildAnd(builder, sign, mask, ""); 922 one = LLVMConstBitCast(bld->one, int_type); 923 res = LLVMBuildOr(builder, sign, one, ""); 924 res = LLVMBuildBitCast(builder, res, vec_type, ""); 925 } 926 else 927 { 928 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); 929 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); 930 res = lp_build_select(bld, cond, bld->one, minus_one); 931 } 932 933 /* Handle zero */ 934 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero); 935 res = lp_build_select(bld, cond, bld->zero, res); 936 937 return res; 938} 939 940 941/** 942 * Set the sign of float vector 'a' according to 'sign'. 943 * If sign==0, return abs(a). 944 * If sign==1, return -abs(a); 945 * Other values for sign produce undefined results. 
946 */ 947LLVMValueRef 948lp_build_set_sign(struct lp_build_context *bld, 949 LLVMValueRef a, LLVMValueRef sign) 950{ 951 LLVMBuilderRef builder = bld->gallivm->builder; 952 const struct lp_type type = bld->type; 953 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 954 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 955 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1); 956 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 957 ~((unsigned long long) 1 << (type.width - 1))); 958 LLVMValueRef val, res; 959 960 assert(type.floating); 961 assert(lp_check_value(type, a)); 962 963 /* val = reinterpret_cast<int>(a) */ 964 val = LLVMBuildBitCast(builder, a, int_vec_type, ""); 965 /* val = val & mask */ 966 val = LLVMBuildAnd(builder, val, mask, ""); 967 /* sign = sign << shift */ 968 sign = LLVMBuildShl(builder, sign, shift, ""); 969 /* res = val | sign */ 970 res = LLVMBuildOr(builder, val, sign, ""); 971 /* res = reinterpret_cast<float>(res) */ 972 res = LLVMBuildBitCast(builder, res, vec_type, ""); 973 974 return res; 975} 976 977 978/** 979 * Convert vector of (or scalar) int to vector of (or scalar) float. 980 */ 981LLVMValueRef 982lp_build_int_to_float(struct lp_build_context *bld, 983 LLVMValueRef a) 984{ 985 LLVMBuilderRef builder = bld->gallivm->builder; 986 const struct lp_type type = bld->type; 987 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 988 989 assert(type.floating); 990 991 return LLVMBuildSIToFP(builder, a, vec_type, ""); 992} 993 994 995 996enum lp_build_round_sse41_mode 997{ 998 LP_BUILD_ROUND_SSE41_NEAREST = 0, 999 LP_BUILD_ROUND_SSE41_FLOOR = 1, 1000 LP_BUILD_ROUND_SSE41_CEIL = 2, 1001 LP_BUILD_ROUND_SSE41_TRUNCATE = 3 1002}; 1003 1004 1005/** 1006 * Helper for SSE4.1's ROUNDxx instructions. 1007 * 1008 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the 1009 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0. 
1010 */ 1011static INLINE LLVMValueRef 1012lp_build_round_sse41(struct lp_build_context *bld, 1013 LLVMValueRef a, 1014 enum lp_build_round_sse41_mode mode) 1015{ 1016 LLVMBuilderRef builder = bld->gallivm->builder; 1017 const struct lp_type type = bld->type; 1018 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1019 const char *intrinsic; 1020 LLVMValueRef res; 1021 1022 assert(type.floating); 1023 1024 assert(lp_check_value(type, a)); 1025 assert(util_cpu_caps.has_sse4_1); 1026 1027 if (type.length == 1) { 1028 LLVMTypeRef vec_type; 1029 LLVMValueRef undef; 1030 LLVMValueRef args[3]; 1031 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1032 1033 switch(type.width) { 1034 case 32: 1035 intrinsic = "llvm.x86.sse41.round.ss"; 1036 break; 1037 case 64: 1038 intrinsic = "llvm.x86.sse41.round.sd"; 1039 break; 1040 default: 1041 assert(0); 1042 return bld->undef; 1043 } 1044 1045 vec_type = LLVMVectorType(bld->elem_type, 4); 1046 1047 undef = LLVMGetUndef(vec_type); 1048 1049 args[0] = undef; 1050 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1051 args[2] = LLVMConstInt(i32t, mode, 0); 1052 1053 res = lp_build_intrinsic(builder, intrinsic, 1054 vec_type, args, Elements(args)); 1055 1056 res = LLVMBuildExtractElement(builder, res, index0, ""); 1057 } 1058 else { 1059 assert(type.width*type.length == 128); 1060 1061 switch(type.width) { 1062 case 32: 1063 intrinsic = "llvm.x86.sse41.round.ps"; 1064 break; 1065 case 64: 1066 intrinsic = "llvm.x86.sse41.round.pd"; 1067 break; 1068 default: 1069 assert(0); 1070 return bld->undef; 1071 } 1072 1073 res = lp_build_intrinsic_binary(builder, intrinsic, 1074 bld->vec_type, a, 1075 LLVMConstInt(i32t, mode, 0)); 1076 } 1077 1078 return res; 1079} 1080 1081 1082static INLINE LLVMValueRef 1083lp_build_iround_nearest_sse2(struct lp_build_context *bld, 1084 LLVMValueRef a) 1085{ 1086 LLVMBuilderRef builder = bld->gallivm->builder; 1087 const struct lp_type type = bld->type; 1088 LLVMTypeRef i32t = 
LLVMInt32TypeInContext(bld->gallivm->context); 1089 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type); 1090 const char *intrinsic; 1091 LLVMValueRef res; 1092 1093 assert(type.floating); 1094 /* using the double precision conversions is a bit more complicated */ 1095 assert(type.width == 32); 1096 1097 assert(lp_check_value(type, a)); 1098 assert(util_cpu_caps.has_sse2); 1099 1100 /* This is relying on MXCSR rounding mode, which should always be nearest. */ 1101 if (type.length == 1) { 1102 LLVMTypeRef vec_type; 1103 LLVMValueRef undef; 1104 LLVMValueRef arg; 1105 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1106 1107 vec_type = LLVMVectorType(bld->elem_type, 4); 1108 1109 intrinsic = "llvm.x86.sse.cvtss2si"; 1110 1111 undef = LLVMGetUndef(vec_type); 1112 1113 arg = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1114 1115 res = lp_build_intrinsic_unary(builder, intrinsic, 1116 ret_type, arg); 1117 } 1118 else { 1119 assert(type.width*type.length == 128); 1120 1121 intrinsic = "llvm.x86.sse2.cvtps2dq"; 1122 1123 res = lp_build_intrinsic_unary(builder, intrinsic, 1124 ret_type, a); 1125 } 1126 1127 return res; 1128} 1129 1130 1131/** 1132 * Return the integer part of a float (vector) value (== round toward zero). 1133 * The returned value is a float (vector). 
1134 * Ex: trunc(-1.5) = -1.0 1135 */ 1136LLVMValueRef 1137lp_build_trunc(struct lp_build_context *bld, 1138 LLVMValueRef a) 1139{ 1140 LLVMBuilderRef builder = bld->gallivm->builder; 1141 const struct lp_type type = bld->type; 1142 1143 assert(type.floating); 1144 assert(lp_check_value(type, a)); 1145 1146 if (util_cpu_caps.has_sse4_1 && 1147 (type.length == 1 || type.width*type.length == 128)) { 1148 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); 1149 } 1150 else { 1151 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1152 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1153 LLVMValueRef res; 1154 res = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 1155 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1156 return res; 1157 } 1158} 1159 1160 1161/** 1162 * Return float (vector) rounded to nearest integer (vector). The returned 1163 * value is a float (vector). 1164 * Ex: round(0.9) = 1.0 1165 * Ex: round(-1.5) = -2.0 1166 */ 1167LLVMValueRef 1168lp_build_round(struct lp_build_context *bld, 1169 LLVMValueRef a) 1170{ 1171 LLVMBuilderRef builder = bld->gallivm->builder; 1172 const struct lp_type type = bld->type; 1173 1174 assert(type.floating); 1175 assert(lp_check_value(type, a)); 1176 1177 if (util_cpu_caps.has_sse4_1 && 1178 (type.length == 1 || type.width*type.length == 128)) { 1179 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 1180 } 1181 else { 1182 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1183 LLVMValueRef res; 1184 res = lp_build_iround(bld, a); 1185 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1186 return res; 1187 } 1188} 1189 1190 1191/** 1192 * Return floor of float (vector), result is a float (vector) 1193 * Ex: floor(1.1) = 1.0 1194 * Ex: floor(-1.1) = -2.0 1195 */ 1196LLVMValueRef 1197lp_build_floor(struct lp_build_context *bld, 1198 LLVMValueRef a) 1199{ 1200 LLVMBuilderRef builder = bld->gallivm->builder; 1201 const struct lp_type type = 
bld->type; 1202 1203 assert(type.floating); 1204 assert(lp_check_value(type, a)); 1205 1206 if (util_cpu_caps.has_sse4_1 && 1207 (type.length == 1 || type.width*type.length == 128)) { 1208 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 1209 } 1210 else { 1211 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1212 LLVMValueRef res; 1213 res = lp_build_ifloor(bld, a); 1214 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1215 return res; 1216 } 1217} 1218 1219 1220/** 1221 * Return ceiling of float (vector), returning float (vector). 1222 * Ex: ceil( 1.1) = 2.0 1223 * Ex: ceil(-1.1) = -1.0 1224 */ 1225LLVMValueRef 1226lp_build_ceil(struct lp_build_context *bld, 1227 LLVMValueRef a) 1228{ 1229 LLVMBuilderRef builder = bld->gallivm->builder; 1230 const struct lp_type type = bld->type; 1231 1232 assert(type.floating); 1233 assert(lp_check_value(type, a)); 1234 1235 if (util_cpu_caps.has_sse4_1 && 1236 (type.length == 1 || type.width*type.length == 128)) { 1237 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 1238 } 1239 else { 1240 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1241 LLVMValueRef res; 1242 res = lp_build_iceil(bld, a); 1243 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1244 return res; 1245 } 1246} 1247 1248 1249/** 1250 * Return fractional part of 'a' computed as a - floor(a) 1251 * Typically used in texture coord arithmetic. 1252 */ 1253LLVMValueRef 1254lp_build_fract(struct lp_build_context *bld, 1255 LLVMValueRef a) 1256{ 1257 assert(bld->type.floating); 1258 return lp_build_sub(bld, a, lp_build_floor(bld, a)); 1259} 1260 1261 1262/** 1263 * Return the integer part of a float (vector) value (== round toward zero). 1264 * The returned value is an integer (vector). 
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* FPToSI rounds toward zero by definition, so no extra work is needed. */
   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   else if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      /* Generic path: add +/-0.5 (matching a's sign) and truncate.
       * Note this rounds halfway cases away from zero, unlike the SSE paths
       * above which use the hardware's (round-to-nearest-even) mode. */
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* half = copysign(0.5, a): OR a's sign bit into the 0.5 constant */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   /* truncation toward zero of the biased value yields round-to-nearest */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}


/**
 * Return floor of float (vector), result is an int (vector)
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      res = a;

      if (type.sign) {
         /* For negative values, bias a by just-under-one before truncating,
          * so that truncation toward zero lands on floor(a). */
         LLVMTypeRef vec_type = bld->vec_type;
         unsigned mantissa = lp_mantissa(type);
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                 (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;
         LLVMValueRef offset;

         /* sign = a < 0 ? ~0 : 0   (arithmetic shift replicates sign bit) */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "ifloor.sign");

         /* offset = -0.99999(9)f */
         offset = lp_build_const_vec(bld->gallivm, type,
                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
         offset = LLVMConstBitCast(offset, int_vec_type);

         /* offset = a < 0 ? offset : 0.0f */
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");

         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      /* Mirror of ifloor(): bias non-negative values by just-under-one so
       * truncation toward zero lands on ceil(a). */
      LLVMTypeRef vec_type = bld->vec_type;
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef offset;

      /* offset = 0.99999(9)f */
      offset = lp_build_const_vec(bld->gallivm, type,
                                  (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));

      if (type.sign) {
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* sign = a < 0 ? 0 : ~0 */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "iceil.sign");
         sign = LLVMBuildNot(builder, sign, "iceil.not");

         /* offset = a < 0 ? 0.0 : offset */
         offset = LLVMConstBitCast(offset, int_vec_type);
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
      }

      res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}


/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


/**
 * Generate sqrt(a) via the llvm.sqrt.* intrinsic for the given vector type.
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   /* e.g. "llvm.sqrt.v4f32" for a 4 x float vector */
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}


/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo.  It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   LLVMValueRef res;

   /* res = rcp_a * (2 - a * rcp_a) */
   res = LLVMBuildFMul(builder, a, rcp_a, "");
   res = LLVMBuildFSub(builder, two, res, "");
   res = LLVMBuildFMul(builder, rcp_a, res, "");

   return res;
}


/**
 * Generate 1/a.
 */
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   /* simple special cases */
   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   /* NOTE: deliberately disabled (FALSE &&) for the reasons above */
   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}


/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also:
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   /* res = 0.5 * rsqrt_a * (3.0 - a * rsqrt_a * rsqrt_a) */
   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}


/**
 * Generate 1/sqrt(a)
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      /* RSQRTPS plus one refinement step; the raw instruction alone only
       * has ~11 bits of precision. */
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rsqrt_refine(bld, a, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/* Helper: splat a 32-bit integer constant into a <4 x i32> vector. */
static inline LLVMValueRef
lp_build_const_v4si(struct gallivm_state *gallivm, unsigned long value)
{
   LLVMValueRef element = lp_build_const_int32(gallivm, value);
   LLVMValueRef elements[4] = { element, element, element, element };
   return LLVMConstVector(elements, 4);
}

/* Helper: splat a float constant into a <4 x float> vector. */
static inline LLVMValueRef
lp_build_const_v4sf(struct gallivm_state *gallivm, float value)
{
   LLVMValueRef element = lp_build_const_float(gallivm, value);
   LLVMValueRef elements[4] = { element, element, element, element };
   return LLVMConstVector(elements, 4);
}


/**
 * Generate sin(a) using SSE2
 *
 * Based on the cephes library's argument-reduction + two-polynomial scheme
 * (the _mm_* lines in the comments show the original SSE intrinsics this
 * IR sequence mirrors).
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;
   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatTypeInContext(bld->gallivm->context), 4);
   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32TypeInContext(bld->gallivm->context), 4);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_v4si(bld->gallivm, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_v4si(bld->gallivm, 0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_v4sf(gallivm, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_v4si(bld->gallivm, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_v4si(bld->gallivm, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef pi32_4 = lp_build_const_v4si(bld->gallivm, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_v4si(bld->gallivm, 29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_v4si(bld->gallivm, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(bld->gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_v4si(bld->gallivm, 0));
   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_v4sf(gallivm, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_v4sf(gallivm, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_v4sf(gallivm, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_v4sf(gallivm, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_v4sf(gallivm, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_v4sf(gallivm, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_v4sf(gallivm, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_v4sf(gallivm, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_v4sf(gallivm, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_v4sf(gallivm, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_v4sf(gallivm, -1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 =
LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_v4si(bld->gallivm, ~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
   return y_result;
}


/**
 * Generate cos(a) using SSE2
 *
 * Same cephes-derived scheme as lp_build_sin() above; differs in the index
 * offset (j - 2) and in how the output sign bit is derived.
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;
   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatTypeInContext(bld->gallivm->context), 4);
   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32TypeInContext(bld->gallivm->context), 4);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_v4si(bld->gallivm, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_v4sf(gallivm, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_v4si(bld->gallivm, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_v4si(bld->gallivm, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");


   /*
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    */
   LLVMValueRef const_2 = lp_build_const_v4si(bld->gallivm, 2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");


   /* get the swap sign flag
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef inv = lp_build_const_v4si(bld->gallivm, ~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_v4si(bld->gallivm, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_v4si(bld->gallivm, 29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_v4si(bld->gallivm, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(bld->gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_v4si(bld->gallivm, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_v4sf(gallivm, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_v4sf(gallivm, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_v4sf(gallivm, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_v4sf(gallivm, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_v4sf(gallivm, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_v4sf(gallivm, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_v4sf(gallivm, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_v4sf(gallivm, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_v4sf(gallivm, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_v4sf(gallivm, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_v4sf(gallivm, -1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
   return y_result;
}


/**
 * Generate pow(x, y), computed as exp2(log2(x) * y).
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}


/**
2142 * Generate exp(x) 2143 */ 2144LLVMValueRef 2145lp_build_exp(struct lp_build_context *bld, 2146 LLVMValueRef x) 2147{ 2148 /* log2(e) = 1/log(2) */ 2149 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type, 2150 1.4426950408889634); 2151 2152 assert(lp_check_value(bld->type, x)); 2153 2154 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x)); 2155} 2156 2157 2158/** 2159 * Generate log(x) 2160 */ 2161LLVMValueRef 2162lp_build_log(struct lp_build_context *bld, 2163 LLVMValueRef x) 2164{ 2165 /* log(2) */ 2166 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 2167 0.69314718055994529); 2168 2169 assert(lp_check_value(bld->type, x)); 2170 2171 return lp_build_mul(bld, log2, lp_build_log2(bld, x)); 2172} 2173 2174 2175/** 2176 * Generate polynomial. 2177 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 2178 */ 2179static LLVMValueRef 2180lp_build_polynomial(struct lp_build_context *bld, 2181 LLVMValueRef x, 2182 const double *coeffs, 2183 unsigned num_coeffs) 2184{ 2185 const struct lp_type type = bld->type; 2186 LLVMValueRef res = NULL; 2187 unsigned i; 2188 2189 assert(lp_check_value(bld->type, x)); 2190 2191 /* TODO: optimize the constant case */ 2192 if (gallivm_debug & GALLIVM_DEBUG_PERF && 2193 LLVMIsConstant(x)) { 2194 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2195 __FUNCTION__); 2196 } 2197 2198 for (i = num_coeffs; i--; ) { 2199 LLVMValueRef coeff; 2200 2201 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]); 2202 2203 if(res) 2204 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res)); 2205 else 2206 res = coeff; 2207 } 2208 2209 if(res) 2210 return res; 2211 else 2212 return bld->undef; 2213} 2214 2215 2216/** 2217 * Minimax polynomial fit of 2**x, in range [0, 1[ 2218 */ 2219const double lp_build_exp2_polynomial[] = { 2220#if EXP_POLY_DEGREE == 5 2221 0.999999925063526176901, 2222 0.693153073200168932794, 2223 0.240153617044375388211, 2224 0.0558263180532956664775, 2225 
0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};


/**
 * Generate 2^x approximation, exposing its intermediate building blocks.
 *
 * Any of the output pointers may be NULL; only the requested values are
 * computed.  Requires 32-bit float vectors (asserted below).
 *
 * @param x                input vector
 * @param p_exp2_int_part  if non-NULL, receives 2^floor(x) (built by
 *                         stuffing the biased integer part into the
 *                         exponent field)
 * @param p_frac_part      if non-NULL, receives x - floor(x)
 * @param p_exp2           if non-NULL, receives the full 2^x approximation
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /* Clamp x -- presumably so the biased-exponent bit-twiddling below
       * stays within the float32 exponent range (TODO confirm bounds). */
      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      ipart = lp_build_floor(bld, x);

      /* fpart = x - ipart */
      fpart = LLVMBuildFSub(builder, x, ipart, "");
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) */
      ipart = LLVMBuildFPToSI(builder, ipart, int_vec_type, "");
expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      /* Move the biased exponent into the float32 exponent field (bit 23+). */
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* 2^fpart via the minimax polynomial, then 2^x = 2^ipart * 2^fpart */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}


/**
 * Generate 2^x.
 */
LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
   return res;
}


/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* Shift the mantissa out, leaving the (biased) exponent field. */
   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   /* NOTE(review): the 255 mask and the 127 bias below assume an 8-bit
    * exponent field, i.e. 32-bit floats -- confirm this is never used
    * with wider float types. */
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   /* Remove the IEEE-754 exponent bias and apply the caller's bias. */
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}


/**
 * Extract the mantissa of a floating point value.
*
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 *
 * i.e. the mantissa scaled into the interval [1, 2[.
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   /* Mask covering just the mantissa bits. */
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   /* Bit pattern of 1.0, supplying the exponent bits of the result. */
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}



/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614,
   -3.32419399085241980044,
   2.59883907202499966007,
   -1.23152682416275988241,
   0.318212422185251071475,
   -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201,
   -2.52074962577807006663,
   1.48116647521213171641,
   -0.465725644288844778798,
   0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148,
   -1.75647175389045657003,
   0.688243882994381274313,
   -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682,
   -1.04913055217340124191,
   0.204446009836232697516
#else
#error
#endif
};


/**
 * See
http://www.devmaster.net/forums/showthread.php?p=43580
 *
 * Generate log2(x) approximation, exposing its building blocks.  Any of
 * the output pointers may be NULL; only the requested values are computed.
 * Requires 32-bit float vectors (asserted below).
 *
 * @param x             input vector
 * @param p_exp         if non-NULL, receives the exponent bits of x
 *                      reinterpreted as a float (i.e. 2^floor(log2(x))
 *                      for positive, normalized x)
 * @param p_floor_log2  if non-NULL, receives floor(log2(x)) as a float
 * @param p_log2        if non-NULL, receives the full log2(x) approximation
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   /* Bit masks for the exponent and mantissa fields of a 32-bit float. */
   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   /* Bit pattern of 1.0f, used to give the extracted mantissa a unit exponent. */
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
*/

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* Unbias the exponent field to get floor(log2(x)), then convert to float. */
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* Polynomial approximation of log2(mant)/(mant - 1) over [1, 2[. */
      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
      logmant = LLVMBuildFMul(builder, logmant, LLVMBuildFSub(builder, mant, bld->one, ""), "");

      res = LLVMBuildFAdd(builder, logmant, logexp, "");
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}


/**
 * Generate log2(x) approximation.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
2531 * 2532 * See http://www.flipcode.com/archives/Fast_log_Function.shtml 2533 */ 2534LLVMValueRef 2535lp_build_fast_log2(struct lp_build_context *bld, 2536 LLVMValueRef x) 2537{ 2538 LLVMBuilderRef builder = bld->gallivm->builder; 2539 LLVMValueRef ipart; 2540 LLVMValueRef fpart; 2541 2542 assert(lp_check_value(bld->type, x)); 2543 2544 assert(bld->type.floating); 2545 2546 /* ipart = floor(log2(x)) - 1 */ 2547 ipart = lp_build_extract_exponent(bld, x, -1); 2548 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, ""); 2549 2550 /* fpart = x / 2**ipart */ 2551 fpart = lp_build_extract_mantissa(bld, x); 2552 2553 /* ipart + fpart */ 2554 return LLVMBuildFAdd(builder, ipart, fpart, ""); 2555} 2556 2557 2558/** 2559 * Fast implementation of iround(log2(x)). 2560 * 2561 * Not an approximation -- it should give accurate results all the time. 2562 */ 2563LLVMValueRef 2564lp_build_ilog2(struct lp_build_context *bld, 2565 LLVMValueRef x) 2566{ 2567 LLVMBuilderRef builder = bld->gallivm->builder; 2568 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2); 2569 LLVMValueRef ipart; 2570 2571 assert(bld->type.floating); 2572 2573 assert(lp_check_value(bld->type, x)); 2574 2575 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ 2576 x = LLVMBuildFMul(builder, x, sqrt2, ""); 2577 2578 /* ipart = floor(log2(x) + 0.5) */ 2579 ipart = lp_build_extract_exponent(bld, x, 0); 2580 2581 return ipart; 2582} 2583