lp_bld_arit.c revision 533ec3f667d36ba2aea564ff047a8f55be13f6e9
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. 
Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
#include "lp_bld_init.h" /* for lp_build_engine */
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_arit.h"


/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   /* TODO: optimize the constant case */

   /* Prefer a native SSE min intrinsic when the vector occupies exactly one
    * 128-bit register and the CPU supports it.  The width/sign combinations
    * below are mutually exclusive, so at most one assignment takes effect. */
   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.min.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.min.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pminu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmins.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);

   /* Generic fallback: a < b ? a : b */
   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   /* TODO: optimize the constant case */

   /* Same CPU/type dispatch as lp_build_min_simple, but selecting max. */
   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.max.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.max.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxs.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);

   /* Generic fallback: a > b ? a : b */
   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   /* For unsigned normalized integers 1.0 is the all-ones bit pattern,
    * so 1 - a reduces to a bitwise not. */
   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(bld->builder, a, "");
   }

   if(LLVMIsConstant(a))
      return LLVMConstSub(bld->one, a);
   else
      return LLVMBuildSub(bld->builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* Saturated arithmetic: anything plus 1.0 clamps to 1.0. */
      if(a == bld->one || b == bld->one)
        return bld->one;

      /* Use the SSE2 saturating add intrinsics for 8/16-bit types. */
      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstAdd(a, b);
   else
      res = LLVMBuildAdd(bld->builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the sum of the elements of a */
LLVMValueRef
lp_build_sum_vector(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i;

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->undef)
      return bld->undef;
   assert(type.length > 1);

   assert(!bld->type.norm);

   /* Scalarize: extract element 0, then fold in the remaining elements. */
   index = LLVMConstInt(LLVMInt32Type(), 0, 0);
   res = LLVMBuildExtractElement(bld->builder, a, index, "");

   for (i = 1; i < type.length; i++) {
      index = LLVMConstInt(LLVMInt32Type(), i, 0);
      res = LLVMBuildAdd(bld->builder, res,
                         LLVMBuildExtractElement(bld->builder, a, index, ""),
                         "");
   }

   return res;
}


/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* Saturated arithmetic: anything minus 1.0 clamps to 0.0. */
      if(b == bld->one)
         return bld->zero;

      /* Use the SSE2 saturating subtract intrinsics for 8/16-bit types. */
      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstSub(a, b);
   else
      res = LLVMBuildSub(bld->builder, a, b, "");

   /* clamp to floor of 0.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}


/**
 * Normalized 8bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 256
 *
 *     which is the fastest method that satisfies the following OpenGL criteria
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfies the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 *     must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_u8n(LLVMBuilderRef builder,
                 struct lp_type i16_type,
                 LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef c8;
   LLVMValueRef ab;

   c8 = lp_build_const_int_vec(i16_type, 8);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 256 */
   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");

#endif

   ab = LLVMBuildLShr(builder, ab, c8, "");

   return ab;
}


/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         /* 8-bit normalized: widen both halves to 16 bits, multiply with
          * rounding in lp_build_mul_u8n, and pack back down. */
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);

         ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME */
      assert(0);
   }

   /* Fixed point: the double-width product must be shifted back into place. */
   if(type.fixed)
      shift = lp_build_const_int_vec(type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      res = LLVMBuildMul(bld->builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(bld->builder, res, shift, "");
         else
            res = LLVMBuildLShr(bld->builder, res, shift, "");
      }
   }

   return res;
}


/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMValueRef factor;

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return LLVMBuildNeg(bld->builder, a, "");

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_pot(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the mantissa.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(bld->builder, a, factor, "");
         a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
         return a;
#endif
      }
      else {
         /* Integer power of two: a plain shift left. */
         factor = lp_build_const_vec(bld->type, shift);
         return LLVMBuildShl(bld->builder, a, factor, "");
      }
   }

   /* General case: fall back to a full multiply by a constant vector. */
   factor = lp_build_const_vec(bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return
bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      return LLVMConstFDiv(a, b);

   /* On SSE, multiply by the reciprocal instead of dividing. */
   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   return LLVMBuildFDiv(bld->builder, a, b, "");
}


/**
 * Linear interpolation.
 *
 * This also works for integer values with a few caveats.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMValueRef delta;
   LLVMValueRef res;

   /* res = v0 + x*(v1 - v0) */
   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if(bld->type.fixed)
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for other uses. Basically we need a more
       * powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");

   return res;
}


/** Bilinear interpolation: lerp along x for both rows, then along y. */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
   return lp_build_lerp(bld, y, v0, v1);
}


/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   /* For normalized types the values are known to lie in [0, 1]. */
   if(bld->type.norm) {
      if(a == bld->zero || b == bld->zero)
         return bld->zero;
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}


/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   /* For normalized types the values are known to lie in [0, 1]. */
   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if(a == bld->zero)
         return b;
      if(b == bld->zero)
         return a;
   }

   return lp_build_max_simple(bld, a, b);
}


/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);

   /* Unsigned values are their own absolute value. */
   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      a = LLVMBuildAnd(bld->builder, a, mask, "");
      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
      return a;
   }

   /* Signed integers: use the SSSE3 PABS intrinsics when available. */
   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }

   /* Generic fallback: max(a, -a). */
   return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   return LLVMBuildNeg(bld->builder, a, "");
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(type);
      vec_type = lp_build_vec_type(type);
      mask = lp_build_const_int_vec(type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(bld->builder, sign, one, "");
      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
   }
   else
   {
      /* Signed integers: compare against zero and select +1 / -1. */
      LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a);
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(bld->builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(bld->builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(bld->builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(bld->builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);

   assert(type.floating);

   return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
}



/* Rounding-mode immediate values for the SSE4.1 ROUNDPS/ROUNDPD intrinsics. */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};


/** Round 'a' with the given mode using a single SSE4.1 round intrinsic. */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   const char *intrinsic;

   assert(type.floating);
   assert(type.width*type.length == 128);
   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   switch(type.width) {
   case 32:
      intrinsic = "llvm.x86.sse41.round.ps";
      break;
   case 64:
      intrinsic = "llvm.x86.sse41.round.pd";
      break;
   default:
      assert(0);
      return bld->undef;
   }

   /* The rounding mode is passed as an immediate second operand. */
   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
}


LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
   else {
      /* Fallback: round-trip through the integer type, which truncates. */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
      LLVMValueRef res;
      res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
      return res;
   }
}


LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   else {
      /* Fallback: integer round then convert back to float. */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef res;
      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
      return res;
   }
}


LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   else {
      /* Fallback: integer floor then convert back to float. */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef res;
      res = lp_build_ifloor(bld, a);
      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
      return res;
   }
}


LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   else {
      /* Fallback: integer ceil then convert back to float. */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef res;
      res = lp_build_iceil(bld, a);
      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(f)
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Convert to integer, through whichever rounding method that's fastest,
 * typically truncating toward zero.
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
}


/**
 * Convert float[] to int[] with round().
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      /* Fallback: add +/-0.5 carrying the same sign as 'a', then truncate. */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
      LLVMValueRef sign;
      LLVMValueRef half;

      /* get sign bit */
      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");

      /* sign * 0.5 */
      half = lp_build_const_vec(type, 0.5);
      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
      half = LLVMBuildOr(bld->builder, sign, half, "");
      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");

      res = LLVMBuildAdd(bld->builder, a, half, "");
   }

   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");

   return res;
}


/**
 * Convert float[] to int[] with floor().
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      /* Take the sign bit and add it to 1 constant */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
      LLVMValueRef sign;
      LLVMValueRef offset;

      /* sign = a < 0 ? ~0 : 0 */
      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
      lp_build_name(sign, "floor.sign");

      /* offset = -0.99999(9)f */
      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
      offset = LLVMConstBitCast(offset, int_vec_type);

      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
      lp_build_name(offset, "floor.offset");

      res = LLVMBuildAdd(bld->builder, a, offset, "");
      lp_build_name(res, "floor.res");
   }

   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
   lp_build_name(res, "floor");

   return res;
}


LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      /* TODO: mimic lp_build_ifloor() here */
      assert(0);
      res = bld->undef;
   }

   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");

   return res;
}


LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   char intrinsic[32];

   /* TODO: optimize the constant case */

   assert(type.floating);
   /* Use LLVM's generic sqrt intrinsic, e.g. "llvm.sqrt.v4f32". */
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
}


LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return
LLVMConstFDiv(bld->one, a); 1122 1123 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { 1124 /* 1125 * XXX: Added precision is not always necessary, so only enable this 1126 * when we have a better system in place to track minimum precision. 1127 */ 1128 1129#if 0 1130 /* 1131 * Do one Newton-Raphson step to improve precision: 1132 * 1133 * x1 = (2 - a * rcp(a)) * rcp(a) 1134 */ 1135 1136 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0); 1137 LLVMValueRef rcp_a; 1138 LLVMValueRef res; 1139 1140 rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a); 1141 1142 res = LLVMBuildMul(bld->builder, a, rcp_a, ""); 1143 res = LLVMBuildSub(bld->builder, two, res, ""); 1144 res = LLVMBuildMul(bld->builder, res, rcp_a, ""); 1145 1146 return rcp_a; 1147#else 1148 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a); 1149#endif 1150 } 1151 1152 return LLVMBuildFDiv(bld->builder, bld->one, a, ""); 1153} 1154 1155 1156/** 1157 * Generate 1/sqrt(a) 1158 */ 1159LLVMValueRef 1160lp_build_rsqrt(struct lp_build_context *bld, 1161 LLVMValueRef a) 1162{ 1163 const struct lp_type type = bld->type; 1164 1165 assert(type.floating); 1166 1167 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) 1168 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a); 1169 1170 return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 1171} 1172 1173 1174#ifdef PIPE_OS_WINDOWS 1175 1176/* 1177 * XXX: X86 backend translates llvm.cos.v4f32 to 4 calls to CRT's cosf() 1178 * which is neither efficient nor does the CRT linkage work on Windows 1179 * causing segmentation fault. 1180 * 1181 * XXX: With LLVM 2.7 both schemes cause an assertion failure. 
1182 */ 1183static LLVMValueRef 1184lp_build_sincos(struct lp_build_context *bld, 1185 const char *name, 1186 float (*func)(float), 1187 LLVMValueRef a) 1188{ 1189 LLVMModuleRef module = 1190 LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld->builder))); 1191 LLVMValueRef function; 1192 LLVMValueRef res; 1193 unsigned i; 1194 1195 assert(bld->type.floating); 1196 assert(bld->type.width == 32); 1197 1198 function = LLVMGetNamedFunction(module, name); 1199 if (!function) { 1200 LLVMTypeRef ret_type; 1201 LLVMTypeRef arg_types[1]; 1202 LLVMTypeRef function_type; 1203 1204 ret_type = LLVMFloatType(); 1205 arg_types[0] = LLVMFloatType(); 1206 function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); 1207 function = LLVMAddFunction(module, name, function_type); 1208 1209 LLVMSetFunctionCallConv(function, LLVMCCallConv); 1210 LLVMSetLinkage(function, LLVMPrivateLinkage); 1211 1212 assert(LLVMIsDeclaration(function)); 1213 1214 LLVMAddGlobalMapping(lp_build_engine, function, func); 1215 } 1216 1217 res = bld->undef; 1218 1219 for (i = 0; i < bld->type.length; ++i) { 1220 LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); 1221 LLVMValueRef args[1]; 1222 LLVMValueRef tmp; 1223 1224 args[0] = LLVMBuildExtractElement(bld->builder, a, index, ""); 1225 1226 tmp = LLVMBuildCall(bld->builder, function, args, Elements(args), ""); 1227 1228 res = LLVMBuildInsertElement(bld->builder, res, tmp, index, ""); 1229 } 1230 1231 return res; 1232} 1233 1234static float c_cosf( float f ) 1235{ 1236 return (float) cos( (double) f ); 1237} 1238 1239static float c_sinf( float f ) 1240{ 1241 return (float) sin( (double) f ); 1242} 1243 1244LLVMValueRef 1245lp_build_cos(struct lp_build_context *bld, 1246 LLVMValueRef a) 1247{ 1248 return lp_build_sincos(bld, "cosf", &c_cosf, a); 1249} 1250 1251LLVMValueRef 1252lp_build_sin(struct lp_build_context *bld, 1253 LLVMValueRef a) 1254{ 1255 return lp_build_sincos(bld, "sinf", &c_sinf, a); 1256} 1257 1258#else 
/* !PIPE_OS_WINDOWS */ 1259 1260/** 1261 * Generate cos(a) 1262 */ 1263LLVMValueRef 1264lp_build_cos(struct lp_build_context *bld, 1265 LLVMValueRef a) 1266{ 1267 const struct lp_type type = bld->type; 1268 LLVMTypeRef vec_type = lp_build_vec_type(type); 1269 char intrinsic[32]; 1270 1271 /* TODO: optimize the constant case */ 1272 1273 assert(type.floating); 1274 util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width); 1275 1276 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a); 1277} 1278 1279 1280/** 1281 * Generate sin(a) 1282 */ 1283LLVMValueRef 1284lp_build_sin(struct lp_build_context *bld, 1285 LLVMValueRef a) 1286{ 1287 const struct lp_type type = bld->type; 1288 LLVMTypeRef vec_type = lp_build_vec_type(type); 1289 char intrinsic[32]; 1290 1291 /* TODO: optimize the constant case */ 1292 1293 assert(type.floating); 1294 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width); 1295 1296 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a); 1297} 1298 1299#endif /* !PIPE_OS_WINDOWS */ 1300 1301 1302/** 1303 * Generate pow(x, y) 1304 */ 1305LLVMValueRef 1306lp_build_pow(struct lp_build_context *bld, 1307 LLVMValueRef x, 1308 LLVMValueRef y) 1309{ 1310 /* TODO: optimize the constant case */ 1311 if(LLVMIsConstant(x) && LLVMIsConstant(y)) 1312 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 1313 __FUNCTION__); 1314 1315 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y)); 1316} 1317 1318 1319/** 1320 * Generate exp(x) 1321 */ 1322LLVMValueRef 1323lp_build_exp(struct lp_build_context *bld, 1324 LLVMValueRef x) 1325{ 1326 /* log2(e) = 1/log(2) */ 1327 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634); 1328 1329 return lp_build_mul(bld, log2e, lp_build_exp2(bld, x)); 1330} 1331 1332 1333/** 1334 * Generate log(x) 1335 */ 1336LLVMValueRef 1337lp_build_log(struct lp_build_context *bld, 1338 LLVMValueRef x) 1339{ 
1340 /* log(2) */ 1341 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529); 1342 1343 return lp_build_mul(bld, log2, lp_build_exp2(bld, x)); 1344} 1345 1346 1347#define EXP_POLY_DEGREE 3 1348#define LOG_POLY_DEGREE 5 1349 1350 1351/** 1352 * Generate polynomial. 1353 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 1354 */ 1355static LLVMValueRef 1356lp_build_polynomial(struct lp_build_context *bld, 1357 LLVMValueRef x, 1358 const double *coeffs, 1359 unsigned num_coeffs) 1360{ 1361 const struct lp_type type = bld->type; 1362 LLVMValueRef res = NULL; 1363 unsigned i; 1364 1365 /* TODO: optimize the constant case */ 1366 if(LLVMIsConstant(x)) 1367 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 1368 __FUNCTION__); 1369 1370 for (i = num_coeffs; i--; ) { 1371 LLVMValueRef coeff; 1372 1373 coeff = lp_build_const_vec(type, coeffs[i]); 1374 1375 if(res) 1376 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res)); 1377 else 1378 res = coeff; 1379 } 1380 1381 if(res) 1382 return res; 1383 else 1384 return bld->undef; 1385} 1386 1387 1388/** 1389 * Minimax polynomial fit of 2**x, in range [0, 1[ 1390 */ 1391const double lp_build_exp2_polynomial[] = { 1392#if EXP_POLY_DEGREE == 5 1393 0.999999999690134838155, 1394 0.583974334321735217258, 1395 0.164553105719676828492, 1396 0.0292811063701710962255, 1397 0.00354944426657875141846, 1398 0.000296253726543423377365 1399#elif EXP_POLY_DEGREE == 4 1400 1.00000001502262084505, 1401 0.563586057338685991394, 1402 0.150436017652442413623, 1403 0.0243220604213317927308, 1404 0.0025359088446580436489 1405#elif EXP_POLY_DEGREE == 3 1406 0.999925218562710312959, 1407 0.695833540494823811697, 1408 0.226067155427249155588, 1409 0.0780245226406372992967 1410#elif EXP_POLY_DEGREE == 2 1411 1.00172476321474503578, 1412 0.657636275736077639316, 1413 0.33718943461968720704 1414#else 1415#error 1416#endif 1417}; 1418 1419 1420void 1421lp_build_exp2_approx(struct lp_build_context *bld, 1422 LLVMValueRef x, 
1423 LLVMValueRef *p_exp2_int_part, 1424 LLVMValueRef *p_frac_part, 1425 LLVMValueRef *p_exp2) 1426{ 1427 const struct lp_type type = bld->type; 1428 LLVMTypeRef vec_type = lp_build_vec_type(type); 1429 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1430 LLVMValueRef ipart = NULL; 1431 LLVMValueRef fpart = NULL; 1432 LLVMValueRef expipart = NULL; 1433 LLVMValueRef expfpart = NULL; 1434 LLVMValueRef res = NULL; 1435 1436 if(p_exp2_int_part || p_frac_part || p_exp2) { 1437 /* TODO: optimize the constant case */ 1438 if(LLVMIsConstant(x)) 1439 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 1440 __FUNCTION__); 1441 1442 assert(type.floating && type.width == 32); 1443 1444 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0)); 1445 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999)); 1446 1447 /* ipart = floor(x) */ 1448 ipart = lp_build_floor(bld, x); 1449 1450 /* fpart = x - ipart */ 1451 fpart = LLVMBuildSub(bld->builder, x, ipart, ""); 1452 } 1453 1454 if(p_exp2_int_part || p_exp2) { 1455 /* expipart = (float) (1 << ipart) */ 1456 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, ""); 1457 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), ""); 1458 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), ""); 1459 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, ""); 1460 } 1461 1462 if(p_exp2) { 1463 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, 1464 Elements(lp_build_exp2_polynomial)); 1465 1466 res = LLVMBuildMul(bld->builder, expipart, expfpart, ""); 1467 } 1468 1469 if(p_exp2_int_part) 1470 *p_exp2_int_part = expipart; 1471 1472 if(p_frac_part) 1473 *p_frac_part = fpart; 1474 1475 if(p_exp2) 1476 *p_exp2 = res; 1477} 1478 1479 1480LLVMValueRef 1481lp_build_exp2(struct lp_build_context *bld, 1482 LLVMValueRef x) 1483{ 1484 LLVMValueRef res; 1485 lp_build_exp2_approx(bld, x, NULL, NULL, &res); 1486 return res; 1487} 1488 
1489 1490/** 1491 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 1492 * These coefficients can be generate with 1493 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html 1494 */ 1495const double lp_build_log2_polynomial[] = { 1496#if LOG_POLY_DEGREE == 6 1497 3.11578814719469302614, 1498 -3.32419399085241980044, 1499 2.59883907202499966007, 1500 -1.23152682416275988241, 1501 0.318212422185251071475, 1502 -0.0344359067839062357313 1503#elif LOG_POLY_DEGREE == 5 1504 2.8882704548164776201, 1505 -2.52074962577807006663, 1506 1.48116647521213171641, 1507 -0.465725644288844778798, 1508 0.0596515482674574969533 1509#elif LOG_POLY_DEGREE == 4 1510 2.61761038894603480148, 1511 -1.75647175389045657003, 1512 0.688243882994381274313, 1513 -0.107254423828329604454 1514#elif LOG_POLY_DEGREE == 3 1515 2.28330284476918490682, 1516 -1.04913055217340124191, 1517 0.204446009836232697516 1518#else 1519#error 1520#endif 1521}; 1522 1523 1524/** 1525 * See http://www.devmaster.net/forums/showthread.php?p=43580 1526 */ 1527void 1528lp_build_log2_approx(struct lp_build_context *bld, 1529 LLVMValueRef x, 1530 LLVMValueRef *p_exp, 1531 LLVMValueRef *p_floor_log2, 1532 LLVMValueRef *p_log2) 1533{ 1534 const struct lp_type type = bld->type; 1535 LLVMTypeRef vec_type = lp_build_vec_type(type); 1536 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1537 1538 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000); 1539 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff); 1540 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); 1541 1542 LLVMValueRef i = NULL; 1543 LLVMValueRef exp = NULL; 1544 LLVMValueRef mant = NULL; 1545 LLVMValueRef logexp = NULL; 1546 LLVMValueRef logmant = NULL; 1547 LLVMValueRef res = NULL; 1548 1549 if(p_exp || p_floor_log2 || p_log2) { 1550 /* TODO: optimize the constant case */ 1551 if(LLVMIsConstant(x)) 1552 debug_printf("%s: inefficient/imprecise 
constant arithmetic\n", 1553 __FUNCTION__); 1554 1555 assert(type.floating && type.width == 32); 1556 1557 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, ""); 1558 1559 /* exp = (float) exponent(x) */ 1560 exp = LLVMBuildAnd(bld->builder, i, expmask, ""); 1561 } 1562 1563 if(p_floor_log2 || p_log2) { 1564 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), ""); 1565 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), ""); 1566 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, ""); 1567 } 1568 1569 if(p_log2) { 1570 /* mant = (float) mantissa(x) */ 1571 mant = LLVMBuildAnd(bld->builder, i, mantmask, ""); 1572 mant = LLVMBuildOr(bld->builder, mant, one, ""); 1573 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, ""); 1574 1575 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial, 1576 Elements(lp_build_log2_polynomial)); 1577 1578 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ 1579 logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), ""); 1580 1581 res = LLVMBuildAdd(bld->builder, logmant, logexp, ""); 1582 } 1583 1584 if(p_exp) { 1585 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, ""); 1586 *p_exp = exp; 1587 } 1588 1589 if(p_floor_log2) 1590 *p_floor_log2 = logexp; 1591 1592 if(p_log2) 1593 *p_log2 = res; 1594} 1595 1596 1597LLVMValueRef 1598lp_build_log2(struct lp_build_context *bld, 1599 LLVMValueRef x) 1600{ 1601 LLVMValueRef res; 1602 lp_build_log2_approx(bld, x, NULL, NULL, &res); 1603 return res; 1604} 1605