lp_bld_arit.c revision 3c929e55449410f97c7d9213d09aa88ef02c888c
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here.
Reasons are: 39 * - it is very easy given we have all necessary information readily available 40 * - LLVM optimization passes fail to simplify several vector expressions 41 * - We often know value constraints which the optimization passes have no way 42 * of knowing, such as when source arguments are known to be in [0, 1] range. 43 * 44 * @author Jose Fonseca <jfonseca@vmware.com> 45 */ 46 47 48#include "util/u_memory.h" 49#include "util/u_debug.h" 50#include "util/u_math.h" 51#include "util/u_string.h" 52#include "util/u_cpu_detect.h" 53 54#include "lp_bld_type.h" 55#include "lp_bld_const.h" 56#include "lp_bld_intr.h" 57#include "lp_bld_init.h" /* for lp_build_engine */ 58#include "lp_bld_logic.h" 59#include "lp_bld_pack.h" 60#include "lp_bld_debug.h" 61#include "lp_bld_arit.h" 62#include "lp_bld_printf.h" 63 64 65/** 66 * Generate min(a, b) 67 * No checks for special case values of a or b = 1 or 0 are done. 68 */ 69static LLVMValueRef 70lp_build_min_simple(struct lp_build_context *bld, 71 LLVMValueRef a, 72 LLVMValueRef b) 73{ 74 const struct lp_type type = bld->type; 75 const char *intrinsic = NULL; 76 LLVMValueRef cond; 77 78 /* TODO: optimize the constant case */ 79 80 if(type.width * type.length == 128) { 81 if(type.floating) { 82 if(type.width == 32 && util_cpu_caps.has_sse) 83 intrinsic = "llvm.x86.sse.min.ps"; 84 if(type.width == 64 && util_cpu_caps.has_sse2) 85 intrinsic = "llvm.x86.sse2.min.pd"; 86 } 87 else { 88 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) 89 intrinsic = "llvm.x86.sse2.pminu.b"; 90 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) 91 intrinsic = "llvm.x86.sse41.pminsb"; 92 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) 93 intrinsic = "llvm.x86.sse41.pminuw"; 94 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) 95 intrinsic = "llvm.x86.sse2.pmins.w"; 96 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) 97 intrinsic = "llvm.x86.sse41.pminud"; 98 if(type.width == 32 && 
type.sign && util_cpu_caps.has_sse4_1) 99 intrinsic = "llvm.x86.sse41.pminsd"; 100 } 101 } 102 103 if(intrinsic) 104 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b); 105 106 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 107 return lp_build_select(bld, cond, a, b); 108} 109 110 111/** 112 * Generate max(a, b) 113 * No checks for special case values of a or b = 1 or 0 are done. 114 */ 115static LLVMValueRef 116lp_build_max_simple(struct lp_build_context *bld, 117 LLVMValueRef a, 118 LLVMValueRef b) 119{ 120 const struct lp_type type = bld->type; 121 const char *intrinsic = NULL; 122 LLVMValueRef cond; 123 124 /* TODO: optimize the constant case */ 125 126 if(type.width * type.length == 128) { 127 if(type.floating) { 128 if(type.width == 32 && util_cpu_caps.has_sse) 129 intrinsic = "llvm.x86.sse.max.ps"; 130 if(type.width == 64 && util_cpu_caps.has_sse2) 131 intrinsic = "llvm.x86.sse2.max.pd"; 132 } 133 else { 134 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) 135 intrinsic = "llvm.x86.sse2.pmaxu.b"; 136 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) 137 intrinsic = "llvm.x86.sse41.pmaxsb"; 138 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) 139 intrinsic = "llvm.x86.sse41.pmaxuw"; 140 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) 141 intrinsic = "llvm.x86.sse2.pmaxs.w"; 142 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) 143 intrinsic = "llvm.x86.sse41.pmaxud"; 144 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) 145 intrinsic = "llvm.x86.sse41.pmaxsd"; 146 } 147 } 148 149 if(intrinsic) 150 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b); 151 152 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 153 return lp_build_select(bld, cond, a, b); 154} 155 156 157/** 158 * Generate 1 - a, or ~a depending on bld->type. 
159 */ 160LLVMValueRef 161lp_build_comp(struct lp_build_context *bld, 162 LLVMValueRef a) 163{ 164 const struct lp_type type = bld->type; 165 166 if(a == bld->one) 167 return bld->zero; 168 if(a == bld->zero) 169 return bld->one; 170 171 if(type.norm && !type.floating && !type.fixed && !type.sign) { 172 if(LLVMIsConstant(a)) 173 return LLVMConstNot(a); 174 else 175 return LLVMBuildNot(bld->builder, a, ""); 176 } 177 178 if(LLVMIsConstant(a)) 179 return LLVMConstSub(bld->one, a); 180 else 181 return LLVMBuildSub(bld->builder, bld->one, a, ""); 182} 183 184 185/** 186 * Generate a + b 187 */ 188LLVMValueRef 189lp_build_add(struct lp_build_context *bld, 190 LLVMValueRef a, 191 LLVMValueRef b) 192{ 193 const struct lp_type type = bld->type; 194 LLVMValueRef res; 195 196 if(a == bld->zero) 197 return b; 198 if(b == bld->zero) 199 return a; 200 if(a == bld->undef || b == bld->undef) 201 return bld->undef; 202 203 if(bld->type.norm) { 204 const char *intrinsic = NULL; 205 206 if(a == bld->one || b == bld->one) 207 return bld->one; 208 209 if(util_cpu_caps.has_sse2 && 210 type.width * type.length == 128 && 211 !type.floating && !type.fixed) { 212 if(type.width == 8) 213 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; 214 if(type.width == 16) 215 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; 216 } 217 218 if(intrinsic) 219 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b); 220 } 221 222 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 223 res = LLVMConstAdd(a, b); 224 else 225 res = LLVMBuildAdd(bld->builder, a, b, ""); 226 227 /* clamp to ceiling of 1.0 */ 228 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 229 res = lp_build_min_simple(bld, res, bld->one); 230 231 /* XXX clamp to floor of -1 or 0??? 
*/ 232 233 return res; 234} 235 236 237/** Return the sum of the elements of a */ 238LLVMValueRef 239lp_build_sum_vector(struct lp_build_context *bld, 240 LLVMValueRef a) 241{ 242 const struct lp_type type = bld->type; 243 LLVMValueRef index, res; 244 unsigned i; 245 246 if (a == bld->zero) 247 return bld->zero; 248 if (a == bld->undef) 249 return bld->undef; 250 assert(type.length > 1); 251 252 assert(!bld->type.norm); 253 254 index = LLVMConstInt(LLVMInt32Type(), 0, 0); 255 res = LLVMBuildExtractElement(bld->builder, a, index, ""); 256 257 for (i = 1; i < type.length; i++) { 258 index = LLVMConstInt(LLVMInt32Type(), i, 0); 259 res = LLVMBuildAdd(bld->builder, res, 260 LLVMBuildExtractElement(bld->builder, a, index, ""), 261 ""); 262 } 263 264 return res; 265} 266 267 268/** 269 * Generate a - b 270 */ 271LLVMValueRef 272lp_build_sub(struct lp_build_context *bld, 273 LLVMValueRef a, 274 LLVMValueRef b) 275{ 276 const struct lp_type type = bld->type; 277 LLVMValueRef res; 278 279 if(b == bld->zero) 280 return a; 281 if(a == bld->undef || b == bld->undef) 282 return bld->undef; 283 if(a == b) 284 return bld->zero; 285 286 if(bld->type.norm) { 287 const char *intrinsic = NULL; 288 289 if(b == bld->one) 290 return bld->zero; 291 292 if(util_cpu_caps.has_sse2 && 293 type.width * type.length == 128 && 294 !type.floating && !type.fixed) { 295 if(type.width == 8) 296 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; 297 if(type.width == 16) 298 intrinsic = type.sign ? 
"llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; 299 } 300 301 if(intrinsic) 302 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b); 303 } 304 305 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 306 res = LLVMConstSub(a, b); 307 else 308 res = LLVMBuildSub(bld->builder, a, b, ""); 309 310 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 311 res = lp_build_max_simple(bld, res, bld->zero); 312 313 return res; 314} 315 316 317/** 318 * Normalized 8bit multiplication. 319 * 320 * - alpha plus one 321 * 322 * makes the following approximation to the division (Sree) 323 * 324 * a*b/255 ~= (a*(b + 1)) >> 256 325 * 326 * which is the fastest method that satisfies the following OpenGL criteria 327 * 328 * 0*0 = 0 and 255*255 = 255 329 * 330 * - geometric series 331 * 332 * takes the geometric series approximation to the division 333 * 334 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) .. 335 * 336 * in this case just the first two terms to fit in 16bit arithmetic 337 * 338 * t/255 ~= (t + (t >> 8)) >> 8 339 * 340 * note that just by itself it doesn't satisfies the OpenGL criteria, as 341 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff 342 * must be used 343 * 344 * - geometric series plus rounding 345 * 346 * when using a geometric series division instead of truncating the result 347 * use roundoff in the approximation (Jim Blinn) 348 * 349 * t/255 ~= (t + (t >> 8) + 0x80) >> 8 350 * 351 * achieving the exact results 352 * 353 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 354 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf 355 * @sa Michael Herf, The "double blend trick", May 2000, 356 * http://www.stereopsis.com/doubleblend.html 357 */ 358static LLVMValueRef 359lp_build_mul_u8n(LLVMBuilderRef builder, 360 struct lp_type i16_type, 361 LLVMValueRef a, LLVMValueRef b) 362{ 363 LLVMValueRef c8; 364 LLVMValueRef ab; 365 366 c8 = lp_build_const_int_vec(i16_type, 8); 367 
368#if 0 369 370 /* a*b/255 ~= (a*(b + 1)) >> 256 */ 371 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), ""); 372 ab = LLVMBuildMul(builder, a, b, ""); 373 374#else 375 376 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */ 377 ab = LLVMBuildMul(builder, a, b, ""); 378 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), ""); 379 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), ""); 380 381#endif 382 383 ab = LLVMBuildLShr(builder, ab, c8, ""); 384 385 return ab; 386} 387 388 389/** 390 * Generate a * b 391 */ 392LLVMValueRef 393lp_build_mul(struct lp_build_context *bld, 394 LLVMValueRef a, 395 LLVMValueRef b) 396{ 397 const struct lp_type type = bld->type; 398 LLVMValueRef shift; 399 LLVMValueRef res; 400 401 if(a == bld->zero) 402 return bld->zero; 403 if(a == bld->one) 404 return b; 405 if(b == bld->zero) 406 return bld->zero; 407 if(b == bld->one) 408 return a; 409 if(a == bld->undef || b == bld->undef) 410 return bld->undef; 411 412 if(!type.floating && !type.fixed && type.norm) { 413 if(type.width == 8) { 414 struct lp_type i16_type = lp_wider_type(type); 415 LLVMValueRef al, ah, bl, bh, abl, abh, ab; 416 417 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah); 418 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh); 419 420 /* PMULLW, PSRLW, PADDW */ 421 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl); 422 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh); 423 424 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh); 425 426 return ab; 427 } 428 429 /* FIXME */ 430 assert(0); 431 } 432 433 if(type.fixed) 434 shift = lp_build_const_int_vec(type, type.width/2); 435 else 436 shift = NULL; 437 438 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 439 res = LLVMConstMul(a, b); 440 if(shift) { 441 if(type.sign) 442 res = LLVMConstAShr(res, shift); 443 else 444 res = LLVMConstLShr(res, shift); 445 } 446 } 447 else { 448 res = LLVMBuildMul(bld->builder, a, b, ""); 449 if(shift) { 450 
if(type.sign) 451 res = LLVMBuildAShr(bld->builder, res, shift, ""); 452 else 453 res = LLVMBuildLShr(bld->builder, res, shift, ""); 454 } 455 } 456 457 return res; 458} 459 460 461/** 462 * Small vector x scale multiplication optimization. 463 */ 464LLVMValueRef 465lp_build_mul_imm(struct lp_build_context *bld, 466 LLVMValueRef a, 467 int b) 468{ 469 LLVMValueRef factor; 470 471 if(b == 0) 472 return bld->zero; 473 474 if(b == 1) 475 return a; 476 477 if(b == -1) 478 return LLVMBuildNeg(bld->builder, a, ""); 479 480 if(b == 2 && bld->type.floating) 481 return lp_build_add(bld, a, a); 482 483 if(util_is_pot(b)) { 484 unsigned shift = ffs(b) - 1; 485 486 if(bld->type.floating) { 487#if 0 488 /* 489 * Power of two multiplication by directly manipulating the mantissa. 490 * 491 * XXX: This might not be always faster, it will introduce a small error 492 * for multiplication by zero, and it will produce wrong results 493 * for Inf and NaN. 494 */ 495 unsigned mantissa = lp_mantissa(bld->type); 496 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa); 497 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), ""); 498 a = LLVMBuildAdd(bld->builder, a, factor, ""); 499 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), ""); 500 return a; 501#endif 502 } 503 else { 504 factor = lp_build_const_vec(bld->type, shift); 505 return LLVMBuildShl(bld->builder, a, factor, ""); 506 } 507 } 508 509 factor = lp_build_const_vec(bld->type, (double)b); 510 return lp_build_mul(bld, a, factor); 511} 512 513 514/** 515 * Generate a / b 516 */ 517LLVMValueRef 518lp_build_div(struct lp_build_context *bld, 519 LLVMValueRef a, 520 LLVMValueRef b) 521{ 522 const struct lp_type type = bld->type; 523 524 if(a == bld->zero) 525 return bld->zero; 526 if(a == bld->one) 527 return lp_build_rcp(bld, b); 528 if(b == bld->zero) 529 return bld->undef; 530 if(b == bld->one) 531 return a; 532 if(a == bld->undef || b == bld->undef) 533 return 
bld->undef; 534 535 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 536 return LLVMConstFDiv(a, b); 537 538 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) 539 return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 540 541 return LLVMBuildFDiv(bld->builder, a, b, ""); 542} 543 544 545/** 546 * Linear interpolation. 547 * 548 * This also works for integer values with a few caveats. 549 * 550 * @sa http://www.stereopsis.com/doubleblend.html 551 */ 552LLVMValueRef 553lp_build_lerp(struct lp_build_context *bld, 554 LLVMValueRef x, 555 LLVMValueRef v0, 556 LLVMValueRef v1) 557{ 558 LLVMValueRef delta; 559 LLVMValueRef res; 560 561 delta = lp_build_sub(bld, v1, v0); 562 563 res = lp_build_mul(bld, x, delta); 564 565 res = lp_build_add(bld, v0, res); 566 567 if(bld->type.fixed) 568 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits, 569 * but it will be wrong for other uses. Basically we need a more 570 * powerful lp_type, capable of further distinguishing the values 571 * interpretation from the value storage. */ 572 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), ""); 573 574 return res; 575} 576 577 578LLVMValueRef 579lp_build_lerp_2d(struct lp_build_context *bld, 580 LLVMValueRef x, 581 LLVMValueRef y, 582 LLVMValueRef v00, 583 LLVMValueRef v01, 584 LLVMValueRef v10, 585 LLVMValueRef v11) 586{ 587 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01); 588 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11); 589 return lp_build_lerp(bld, y, v0, v1); 590} 591 592 593/** 594 * Generate min(a, b) 595 * Do checks for special cases. 
596 */ 597LLVMValueRef 598lp_build_min(struct lp_build_context *bld, 599 LLVMValueRef a, 600 LLVMValueRef b) 601{ 602 if(a == bld->undef || b == bld->undef) 603 return bld->undef; 604 605 if(a == b) 606 return a; 607 608 if(bld->type.norm) { 609 if(a == bld->zero || b == bld->zero) 610 return bld->zero; 611 if(a == bld->one) 612 return b; 613 if(b == bld->one) 614 return a; 615 } 616 617 return lp_build_min_simple(bld, a, b); 618} 619 620 621/** 622 * Generate max(a, b) 623 * Do checks for special cases. 624 */ 625LLVMValueRef 626lp_build_max(struct lp_build_context *bld, 627 LLVMValueRef a, 628 LLVMValueRef b) 629{ 630 if(a == bld->undef || b == bld->undef) 631 return bld->undef; 632 633 if(a == b) 634 return a; 635 636 if(bld->type.norm) { 637 if(a == bld->one || b == bld->one) 638 return bld->one; 639 if(a == bld->zero) 640 return b; 641 if(b == bld->zero) 642 return a; 643 } 644 645 return lp_build_max_simple(bld, a, b); 646} 647 648 649/** 650 * Generate clamp(a, min, max) 651 * Do checks for special cases. 
652 */ 653LLVMValueRef 654lp_build_clamp(struct lp_build_context *bld, 655 LLVMValueRef a, 656 LLVMValueRef min, 657 LLVMValueRef max) 658{ 659 a = lp_build_min(bld, a, max); 660 a = lp_build_max(bld, a, min); 661 return a; 662} 663 664 665/** 666 * Generate abs(a) 667 */ 668LLVMValueRef 669lp_build_abs(struct lp_build_context *bld, 670 LLVMValueRef a) 671{ 672 const struct lp_type type = bld->type; 673 LLVMTypeRef vec_type = lp_build_vec_type(type); 674 675 if(!type.sign) 676 return a; 677 678 if(type.floating) { 679 /* Mask out the sign bit */ 680 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 681 unsigned long long absMask = ~(1ULL << (type.width - 1)); 682 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask)); 683 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); 684 a = LLVMBuildAnd(bld->builder, a, mask, ""); 685 a = LLVMBuildBitCast(bld->builder, a, vec_type, ""); 686 return a; 687 } 688 689 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) { 690 switch(type.width) { 691 case 8: 692 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 693 case 16: 694 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 695 case 32: 696 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 697 } 698 } 699 700 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, "")); 701} 702 703 704LLVMValueRef 705lp_build_negate(struct lp_build_context *bld, 706 LLVMValueRef a) 707{ 708 return LLVMBuildNeg(bld->builder, a, ""); 709} 710 711 712/** Return -1, 0 or +1 depending on the sign of a */ 713LLVMValueRef 714lp_build_sgn(struct lp_build_context *bld, 715 LLVMValueRef a) 716{ 717 const struct lp_type type = bld->type; 718 LLVMValueRef cond; 719 LLVMValueRef res; 720 721 /* Handle non-zero case */ 722 if(!type.sign) { 723 /* if not zero then sign must be positive */ 724 res = bld->one; 725 } 726 else if(type.floating) { 
727 LLVMTypeRef vec_type; 728 LLVMTypeRef int_type; 729 LLVMValueRef mask; 730 LLVMValueRef sign; 731 LLVMValueRef one; 732 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); 733 734 int_type = lp_build_int_vec_type(type); 735 vec_type = lp_build_vec_type(type); 736 mask = lp_build_const_int_vec(type, maskBit); 737 738 /* Take the sign bit and add it to 1 constant */ 739 sign = LLVMBuildBitCast(bld->builder, a, int_type, ""); 740 sign = LLVMBuildAnd(bld->builder, sign, mask, ""); 741 one = LLVMConstBitCast(bld->one, int_type); 742 res = LLVMBuildOr(bld->builder, sign, one, ""); 743 res = LLVMBuildBitCast(bld->builder, res, vec_type, ""); 744 } 745 else 746 { 747 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0); 748 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); 749 res = lp_build_select(bld, cond, bld->one, minus_one); 750 } 751 752 /* Handle zero */ 753 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero); 754 res = lp_build_select(bld, cond, bld->zero, res); 755 756 return res; 757} 758 759 760/** 761 * Set the sign of float vector 'a' according to 'sign'. 762 * If sign==0, return abs(a). 763 * If sign==1, return -abs(a); 764 * Other values for sign produce undefined results. 
765 */ 766LLVMValueRef 767lp_build_set_sign(struct lp_build_context *bld, 768 LLVMValueRef a, LLVMValueRef sign) 769{ 770 const struct lp_type type = bld->type; 771 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 772 LLVMTypeRef vec_type = lp_build_vec_type(type); 773 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1); 774 LLVMValueRef mask = lp_build_const_int_vec(type, 775 ~((unsigned long long) 1 << (type.width - 1))); 776 LLVMValueRef val, res; 777 778 assert(type.floating); 779 780 /* val = reinterpret_cast<int>(a) */ 781 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); 782 /* val = val & mask */ 783 val = LLVMBuildAnd(bld->builder, val, mask, ""); 784 /* sign = sign << shift */ 785 sign = LLVMBuildShl(bld->builder, sign, shift, ""); 786 /* res = val | sign */ 787 res = LLVMBuildOr(bld->builder, val, sign, ""); 788 /* res = reinterpret_cast<float>(res) */ 789 res = LLVMBuildBitCast(bld->builder, res, vec_type, ""); 790 791 return res; 792} 793 794 795/** 796 * Convert vector of (or scalar) int to vector of (or scalar) float. 
797 */ 798LLVMValueRef 799lp_build_int_to_float(struct lp_build_context *bld, 800 LLVMValueRef a) 801{ 802 const struct lp_type type = bld->type; 803 LLVMTypeRef vec_type = lp_build_vec_type(type); 804 805 assert(type.floating); 806 807 return LLVMBuildSIToFP(bld->builder, a, vec_type, ""); 808} 809 810 811 812enum lp_build_round_sse41_mode 813{ 814 LP_BUILD_ROUND_SSE41_NEAREST = 0, 815 LP_BUILD_ROUND_SSE41_FLOOR = 1, 816 LP_BUILD_ROUND_SSE41_CEIL = 2, 817 LP_BUILD_ROUND_SSE41_TRUNCATE = 3 818}; 819 820 821static INLINE LLVMValueRef 822lp_build_round_sse41(struct lp_build_context *bld, 823 LLVMValueRef a, 824 enum lp_build_round_sse41_mode mode) 825{ 826 const struct lp_type type = bld->type; 827 LLVMTypeRef vec_type = lp_build_vec_type(type); 828 const char *intrinsic; 829 830 assert(type.floating); 831 assert(type.width*type.length == 128); 832 assert(lp_check_value(type, a)); 833 assert(util_cpu_caps.has_sse4_1); 834 835 switch(type.width) { 836 case 32: 837 intrinsic = "llvm.x86.sse41.round.ps"; 838 break; 839 case 64: 840 intrinsic = "llvm.x86.sse41.round.pd"; 841 break; 842 default: 843 assert(0); 844 return bld->undef; 845 } 846 847 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a, 848 LLVMConstInt(LLVMInt32Type(), mode, 0)); 849} 850 851 852LLVMValueRef 853lp_build_trunc(struct lp_build_context *bld, 854 LLVMValueRef a) 855{ 856 const struct lp_type type = bld->type; 857 858 assert(type.floating); 859 assert(lp_check_value(type, a)); 860 861 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) 862 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); 863 else { 864 LLVMTypeRef vec_type = lp_build_vec_type(type); 865 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 866 LLVMValueRef res; 867 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, ""); 868 res = LLVMBuildSIToFP(bld->builder, res, vec_type, ""); 869 return res; 870 } 871} 872 873 874LLVMValueRef 875lp_build_round(struct lp_build_context *bld, 876 
LLVMValueRef a) 877{ 878 const struct lp_type type = bld->type; 879 880 assert(type.floating); 881 assert(lp_check_value(type, a)); 882 883 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) 884 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 885 else { 886 LLVMTypeRef vec_type = lp_build_vec_type(type); 887 LLVMValueRef res; 888 res = lp_build_iround(bld, a); 889 res = LLVMBuildSIToFP(bld->builder, res, vec_type, ""); 890 return res; 891 } 892} 893 894 895LLVMValueRef 896lp_build_floor(struct lp_build_context *bld, 897 LLVMValueRef a) 898{ 899 const struct lp_type type = bld->type; 900 901 assert(type.floating); 902 assert(lp_check_value(type, a)); 903 904 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) 905 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 906 else { 907 LLVMTypeRef vec_type = lp_build_vec_type(type); 908 LLVMValueRef res; 909 res = lp_build_ifloor(bld, a); 910 res = LLVMBuildSIToFP(bld->builder, res, vec_type, ""); 911 return res; 912 } 913} 914 915 916LLVMValueRef 917lp_build_ceil(struct lp_build_context *bld, 918 LLVMValueRef a) 919{ 920 const struct lp_type type = bld->type; 921 922 assert(type.floating); 923 assert(lp_check_value(type, a)); 924 925 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) 926 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 927 else { 928 LLVMTypeRef vec_type = lp_build_vec_type(type); 929 LLVMValueRef res; 930 res = lp_build_iceil(bld, a); 931 res = LLVMBuildSIToFP(bld->builder, res, vec_type, ""); 932 return res; 933 } 934} 935 936 937/** 938 * Return fractional part of 'a' computed as a - floor(f) 939 * Typically used in texture coord arithmetic. 
940 */ 941LLVMValueRef 942lp_build_fract(struct lp_build_context *bld, 943 LLVMValueRef a) 944{ 945 assert(bld->type.floating); 946 return lp_build_sub(bld, a, lp_build_floor(bld, a)); 947} 948 949 950/** 951 * Convert to integer, through whichever rounding method that's fastest, 952 * typically truncating toward zero. 953 */ 954LLVMValueRef 955lp_build_itrunc(struct lp_build_context *bld, 956 LLVMValueRef a) 957{ 958 const struct lp_type type = bld->type; 959 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 960 961 assert(type.floating); 962 assert(lp_check_value(type, a)); 963 964 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, ""); 965} 966 967 968/** 969 * Convert float[] to int[] with round(). 970 */ 971LLVMValueRef 972lp_build_iround(struct lp_build_context *bld, 973 LLVMValueRef a) 974{ 975 const struct lp_type type = bld->type; 976 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 977 LLVMValueRef res; 978 979 assert(type.floating); 980 981 assert(lp_check_value(type, a)); 982 983 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { 984 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 985 } 986 else { 987 LLVMTypeRef vec_type = lp_build_vec_type(type); 988 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); 989 LLVMValueRef sign; 990 LLVMValueRef half; 991 992 /* get sign bit */ 993 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); 994 sign = LLVMBuildAnd(bld->builder, sign, mask, ""); 995 996 /* sign * 0.5 */ 997 half = lp_build_const_vec(type, 0.5); 998 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); 999 half = LLVMBuildOr(bld->builder, sign, half, ""); 1000 half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); 1001 1002 res = LLVMBuildAdd(bld->builder, a, half, ""); 1003 } 1004 1005 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, ""); 1006 1007 return res; 1008} 1009 1010 1011/** 1012 * Convert float[] to int[] with floor(). 
1013 */ 1014LLVMValueRef 1015lp_build_ifloor(struct lp_build_context *bld, 1016 LLVMValueRef a) 1017{ 1018 const struct lp_type type = bld->type; 1019 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1020 LLVMValueRef res; 1021 1022 assert(type.floating); 1023 assert(lp_check_value(type, a)); 1024 1025 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { 1026 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 1027 } 1028 else { 1029 /* Take the sign bit and add it to 1 constant */ 1030 LLVMTypeRef vec_type = lp_build_vec_type(type); 1031 unsigned mantissa = lp_mantissa(type); 1032 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); 1033 LLVMValueRef sign; 1034 LLVMValueRef offset; 1035 1036 /* sign = a < 0 ? ~0 : 0 */ 1037 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); 1038 sign = LLVMBuildAnd(bld->builder, sign, mask, ""); 1039 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), ""); 1040 lp_build_name(sign, "floor.sign"); 1041 1042 /* offset = -0.99999(9)f */ 1043 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa)); 1044 offset = LLVMConstBitCast(offset, int_vec_type); 1045 1046 /* offset = a < 0 ? 
-0.99999(9)f : 0.0f */ 1047 offset = LLVMBuildAnd(bld->builder, offset, sign, ""); 1048 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, ""); 1049 lp_build_name(offset, "floor.offset"); 1050 1051 res = LLVMBuildAdd(bld->builder, a, offset, ""); 1052 lp_build_name(res, "floor.res"); 1053 } 1054 1055 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, ""); 1056 lp_build_name(res, "floor"); 1057 1058 return res; 1059} 1060 1061 1062LLVMValueRef 1063lp_build_iceil(struct lp_build_context *bld, 1064 LLVMValueRef a) 1065{ 1066 const struct lp_type type = bld->type; 1067 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1068 LLVMValueRef res; 1069 1070 assert(type.floating); 1071 assert(lp_check_value(type, a)); 1072 1073 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { 1074 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 1075 } 1076 else { 1077 /* TODO: mimic lp_build_ifloor() here */ 1078 assert(0); 1079 res = bld->undef; 1080 } 1081 1082 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, ""); 1083 1084 return res; 1085} 1086 1087 1088LLVMValueRef 1089lp_build_sqrt(struct lp_build_context *bld, 1090 LLVMValueRef a) 1091{ 1092 const struct lp_type type = bld->type; 1093 LLVMTypeRef vec_type = lp_build_vec_type(type); 1094 char intrinsic[32]; 1095 1096 /* TODO: optimize the constant case */ 1097 /* TODO: optimize the constant case */ 1098 1099 assert(type.floating); 1100 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width); 1101 1102 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a); 1103} 1104 1105 1106LLVMValueRef 1107lp_build_rcp(struct lp_build_context *bld, 1108 LLVMValueRef a) 1109{ 1110 const struct lp_type type = bld->type; 1111 1112 if(a == bld->zero) 1113 return bld->undef; 1114 if(a == bld->one) 1115 return bld->one; 1116 if(a == bld->undef) 1117 return bld->undef; 1118 1119 assert(type.floating); 1120 1121 if(LLVMIsConstant(a)) 1122 return 
LLVMConstFDiv(bld->one, a); 1123 1124 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { 1125 /* 1126 * XXX: Added precision is not always necessary, so only enable this 1127 * when we have a better system in place to track minimum precision. 1128 */ 1129 1130#if 0 1131 /* 1132 * Do one Newton-Raphson step to improve precision: 1133 * 1134 * x1 = (2 - a * rcp(a)) * rcp(a) 1135 */ 1136 1137 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0); 1138 LLVMValueRef rcp_a; 1139 LLVMValueRef res; 1140 1141 rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a); 1142 1143 res = LLVMBuildMul(bld->builder, a, rcp_a, ""); 1144 res = LLVMBuildSub(bld->builder, two, res, ""); 1145 res = LLVMBuildMul(bld->builder, res, rcp_a, ""); 1146 1147 return rcp_a; 1148#else 1149 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a); 1150#endif 1151 } 1152 1153 return LLVMBuildFDiv(bld->builder, bld->one, a, ""); 1154} 1155 1156 1157/** 1158 * Generate 1/sqrt(a) 1159 */ 1160LLVMValueRef 1161lp_build_rsqrt(struct lp_build_context *bld, 1162 LLVMValueRef a) 1163{ 1164 const struct lp_type type = bld->type; 1165 1166 assert(type.floating); 1167 1168 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) 1169 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a); 1170 1171 return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 1172} 1173 1174 1175static inline LLVMValueRef 1176lp_build_const_v4si(unsigned long value) 1177{ 1178 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0); 1179 LLVMValueRef elements[4] = { element, element, element, element }; 1180 return LLVMConstVector(elements, 4); 1181} 1182 1183static inline LLVMValueRef 1184lp_build_const_v4sf(float value) 1185{ 1186 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value); 1187 LLVMValueRef elements[4] = { element, element, element, element }; 1188 return 
LLVMConstVector(elements, 4); 1189} 1190 1191 1192/** 1193 * Generate sin(a) using SSE2 1194 */ 1195LLVMValueRef 1196lp_build_sin(struct lp_build_context *bld, 1197 LLVMValueRef a) 1198{ 1199 struct lp_type int_type = lp_int_type(bld->type); 1200 LLVMBuilderRef b = bld->builder; 1201 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4); 1202 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4); 1203 1204 /* 1205 * take the absolute value, 1206 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 1207 */ 1208 1209 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000); 1210 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si"); 1211 1212 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 1213 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs"); 1214 1215 /* 1216 * extract the sign bit (upper one) 1217 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask); 1218 */ 1219 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000); 1220 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i"); 1221 1222 /* 1223 * scale by 4/Pi 1224 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 1225 */ 1226 1227 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516); 1228 LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y"); 1229 1230 /* 1231 * store the integer part of y in mm0 1232 * emm2 = _mm_cvttps_epi32(y); 1233 */ 1234 1235 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i"); 1236 1237 /* 1238 * j=(j+1) & (~1) (see the cephes sources) 1239 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 1240 */ 1241 1242 LLVMValueRef all_one = lp_build_const_v4si(1); 1243 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 1244 /* 1245 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 1246 */ 1247 LLVMValueRef inv_one = lp_build_const_v4si(~1); 1248 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 1249 1250 /* 1251 * y = _mm_cvtepi32_ps(emm2); 1252 */ 1253 LLVMValueRef y_2 = 
LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2"); 1254 1255 /* get the swap sign flag 1256 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4); 1257 */ 1258 LLVMValueRef pi32_4 = lp_build_const_v4si(4); 1259 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and"); 1260 1261 /* 1262 * emm2 = _mm_slli_epi32(emm0, 29); 1263 */ 1264 LLVMValueRef const_29 = lp_build_const_v4si(29); 1265 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit"); 1266 1267 /* 1268 * get the polynom selection mask 1269 * there is one polynom for 0 <= x <= Pi/4 1270 * and another one for Pi/4<x<=Pi/2 1271 * Both branches will be computed. 1272 * 1273 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 1274 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 1275 */ 1276 1277 LLVMValueRef pi32_2 = lp_build_const_v4si(2); 1278 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3"); 1279 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL, 1280 emm2_3, lp_build_const_v4si(0)); 1281 /* 1282 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit); 1283 */ 1284 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit"); 1285 1286 /* 1287 * _PS_CONST(minus_cephes_DP1, -0.78515625); 1288 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 1289 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 1290 */ 1291 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625); 1292 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4); 1293 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8); 1294 1295 /* 1296 * The magic pass: "Extended precision modular arithmetic" 1297 * x = ((x - y * DP1) - y * DP2) - y * DP3; 1298 * xmm1 = _mm_mul_ps(y, xmm1); 1299 * xmm2 = _mm_mul_ps(y, xmm2); 1300 * xmm3 = _mm_mul_ps(y, xmm3); 1301 */ 1302 LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1"); 1303 LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2"); 1304 LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3"); 1305 1306 /* 
1307 * x = _mm_add_ps(x, xmm1); 1308 * x = _mm_add_ps(x, xmm2); 1309 * x = _mm_add_ps(x, xmm3); 1310 */ 1311 1312 LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1"); 1313 LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2"); 1314 LLVMValueRef x_3 = LLVMBuildAdd(b, x_2, xmm3, "x_3"); 1315 1316 /* 1317 * Evaluate the first polynom (0 <= x <= Pi/4) 1318 * 1319 * z = _mm_mul_ps(x,x); 1320 */ 1321 LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z"); 1322 1323 /* 1324 * _PS_CONST(coscof_p0, 2.443315711809948E-005); 1325 * _PS_CONST(coscof_p1, -1.388731625493765E-003); 1326 * _PS_CONST(coscof_p2, 4.166664568298827E-002); 1327 */ 1328 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005); 1329 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003); 1330 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002); 1331 1332 /* 1333 * y = *(v4sf*)_ps_coscof_p0; 1334 * y = _mm_mul_ps(y, z); 1335 */ 1336 LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3"); 1337 LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4"); 1338 LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5"); 1339 LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6"); 1340 LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7"); 1341 LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8"); 1342 1343 1344 /* 1345 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 1346 * y = _mm_sub_ps(y, tmp); 1347 * y = _mm_add_ps(y, *(v4sf*)_ps_1); 1348 */ 1349 LLVMValueRef half = lp_build_const_v4sf(0.5); 1350 LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp"); 1351 LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8"); 1352 LLVMValueRef one = lp_build_const_v4sf(1.0); 1353 LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9"); 1354 1355 /* 1356 * _PS_CONST(sincof_p0, -1.9515295891E-4); 1357 * _PS_CONST(sincof_p1, 8.3321608736E-3); 1358 * _PS_CONST(sincof_p2, -1.6666654611E-1); 1359 */ 1360 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4); 1361 LLVMValueRef sincof_p1 = 
lp_build_const_v4sf(8.3321608736E-3); 1362 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1); 1363 1364 /* 1365 * Evaluate the second polynom (Pi/4 <= x <= 0) 1366 * 1367 * y2 = *(v4sf*)_ps_sincof_p0; 1368 * y2 = _mm_mul_ps(y2, z); 1369 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 1370 * y2 = _mm_mul_ps(y2, z); 1371 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 1372 * y2 = _mm_mul_ps(y2, z); 1373 * y2 = _mm_mul_ps(y2, x); 1374 * y2 = _mm_add_ps(y2, x); 1375 */ 1376 1377 LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3"); 1378 LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4"); 1379 LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5"); 1380 LLVMValueRef y2_6 = LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6"); 1381 LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7"); 1382 LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8"); 1383 LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9"); 1384 1385 /* 1386 * select the correct result from the two polynoms 1387 * xmm3 = poly_mask; 1388 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 1389 * y = _mm_andnot_ps(xmm3, y); 1390 * y = _mm_add_ps(y,y2); 1391 */ 1392 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i"); 1393 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i"); 1394 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 1395 LLVMValueRef inv = lp_build_const_v4si(~0); 1396 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); 1397 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 1398 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); 1399 1400 /* 1401 * update the sign 1402 * y = _mm_xor_ps(y, sign_bit); 1403 */ 1404 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin"); 1405 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result"); 1406 return y_result; 1407} 1408 1409 1410/** 1411 * Generate cos(a) using SSE2 1412 */ 1413LLVMValueRef 1414lp_build_cos(struct lp_build_context *bld, 
1415 LLVMValueRef a) 1416{ 1417 struct lp_type int_type = lp_int_type(bld->type); 1418 LLVMBuilderRef b = bld->builder; 1419 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4); 1420 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4); 1421 1422 /* 1423 * take the absolute value, 1424 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 1425 */ 1426 1427 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000); 1428 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si"); 1429 1430 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 1431 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs"); 1432 1433 /* 1434 * scale by 4/Pi 1435 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 1436 */ 1437 1438 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516); 1439 LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y"); 1440 1441 /* 1442 * store the integer part of y in mm0 1443 * emm2 = _mm_cvttps_epi32(y); 1444 */ 1445 1446 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i"); 1447 1448 /* 1449 * j=(j+1) & (~1) (see the cephes sources) 1450 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 1451 */ 1452 1453 LLVMValueRef all_one = lp_build_const_v4si(1); 1454 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 1455 /* 1456 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 1457 */ 1458 LLVMValueRef inv_one = lp_build_const_v4si(~1); 1459 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 1460 1461 /* 1462 * y = _mm_cvtepi32_ps(emm2); 1463 */ 1464 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2"); 1465 1466 1467 /* 1468 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2); 1469 */ 1470 LLVMValueRef const_2 = lp_build_const_v4si(2); 1471 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2"); 1472 1473 1474 /* get the swap sign flag 1475 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4); 1476 */ 1477 LLVMValueRef inv = lp_build_const_v4si(~0); 1478 LLVMValueRef emm0_not = 
LLVMBuildXor(b, emm2_2, inv, "emm0_not"); 1479 LLVMValueRef pi32_4 = lp_build_const_v4si(4); 1480 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and"); 1481 1482 /* 1483 * emm2 = _mm_slli_epi32(emm0, 29); 1484 */ 1485 LLVMValueRef const_29 = lp_build_const_v4si(29); 1486 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit"); 1487 1488 /* 1489 * get the polynom selection mask 1490 * there is one polynom for 0 <= x <= Pi/4 1491 * and another one for Pi/4<x<=Pi/2 1492 * Both branches will be computed. 1493 * 1494 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 1495 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 1496 */ 1497 1498 LLVMValueRef pi32_2 = lp_build_const_v4si(2); 1499 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3"); 1500 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL, 1501 emm2_3, lp_build_const_v4si(0)); 1502 1503 /* 1504 * _PS_CONST(minus_cephes_DP1, -0.78515625); 1505 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 1506 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 1507 */ 1508 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625); 1509 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4); 1510 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8); 1511 1512 /* 1513 * The magic pass: "Extended precision modular arithmetic" 1514 * x = ((x - y * DP1) - y * DP2) - y * DP3; 1515 * xmm1 = _mm_mul_ps(y, xmm1); 1516 * xmm2 = _mm_mul_ps(y, xmm2); 1517 * xmm3 = _mm_mul_ps(y, xmm3); 1518 */ 1519 LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1"); 1520 LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2"); 1521 LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3"); 1522 1523 /* 1524 * x = _mm_add_ps(x, xmm1); 1525 * x = _mm_add_ps(x, xmm2); 1526 * x = _mm_add_ps(x, xmm3); 1527 */ 1528 1529 LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1"); 1530 LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2"); 1531 LLVMValueRef x_3 = LLVMBuildAdd(b, 
x_2, xmm3, "x_3"); 1532 1533 /* 1534 * Evaluate the first polynom (0 <= x <= Pi/4) 1535 * 1536 * z = _mm_mul_ps(x,x); 1537 */ 1538 LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z"); 1539 1540 /* 1541 * _PS_CONST(coscof_p0, 2.443315711809948E-005); 1542 * _PS_CONST(coscof_p1, -1.388731625493765E-003); 1543 * _PS_CONST(coscof_p2, 4.166664568298827E-002); 1544 */ 1545 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005); 1546 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003); 1547 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002); 1548 1549 /* 1550 * y = *(v4sf*)_ps_coscof_p0; 1551 * y = _mm_mul_ps(y, z); 1552 */ 1553 LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3"); 1554 LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4"); 1555 LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5"); 1556 LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6"); 1557 LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7"); 1558 LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8"); 1559 1560 1561 /* 1562 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 1563 * y = _mm_sub_ps(y, tmp); 1564 * y = _mm_add_ps(y, *(v4sf*)_ps_1); 1565 */ 1566 LLVMValueRef half = lp_build_const_v4sf(0.5); 1567 LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp"); 1568 LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8"); 1569 LLVMValueRef one = lp_build_const_v4sf(1.0); 1570 LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9"); 1571 1572 /* 1573 * _PS_CONST(sincof_p0, -1.9515295891E-4); 1574 * _PS_CONST(sincof_p1, 8.3321608736E-3); 1575 * _PS_CONST(sincof_p2, -1.6666654611E-1); 1576 */ 1577 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4); 1578 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3); 1579 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1); 1580 1581 /* 1582 * Evaluate the second polynom (Pi/4 <= x <= 0) 1583 * 1584 * y2 = *(v4sf*)_ps_sincof_p0; 1585 * y2 = _mm_mul_ps(y2, z); 1586 * y2 = _mm_add_ps(y2, 
*(v4sf*)_ps_sincof_p1); 1587 * y2 = _mm_mul_ps(y2, z); 1588 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 1589 * y2 = _mm_mul_ps(y2, z); 1590 * y2 = _mm_mul_ps(y2, x); 1591 * y2 = _mm_add_ps(y2, x); 1592 */ 1593 1594 LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3"); 1595 LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4"); 1596 LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5"); 1597 LLVMValueRef y2_6 = LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6"); 1598 LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7"); 1599 LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8"); 1600 LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9"); 1601 1602 /* 1603 * select the correct result from the two polynoms 1604 * xmm3 = poly_mask; 1605 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 1606 * y = _mm_andnot_ps(xmm3, y); 1607 * y = _mm_add_ps(y,y2); 1608 */ 1609 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i"); 1610 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i"); 1611 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 1612 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); 1613 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 1614 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); 1615 1616 /* 1617 * update the sign 1618 * y = _mm_xor_ps(y, sign_bit); 1619 */ 1620 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin"); 1621 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result"); 1622 return y_result; 1623} 1624 1625 1626/** 1627 * Generate pow(x, y) 1628 */ 1629LLVMValueRef 1630lp_build_pow(struct lp_build_context *bld, 1631 LLVMValueRef x, 1632 LLVMValueRef y) 1633{ 1634 /* TODO: optimize the constant case */ 1635 if(LLVMIsConstant(x) && LLVMIsConstant(y)) 1636 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 1637 __FUNCTION__); 1638 1639 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y)); 1640} 1641 1642 
1643/** 1644 * Generate exp(x) 1645 */ 1646LLVMValueRef 1647lp_build_exp(struct lp_build_context *bld, 1648 LLVMValueRef x) 1649{ 1650 /* log2(e) = 1/log(2) */ 1651 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634); 1652 1653 return lp_build_mul(bld, log2e, lp_build_exp2(bld, x)); 1654} 1655 1656 1657/** 1658 * Generate log(x) 1659 */ 1660LLVMValueRef 1661lp_build_log(struct lp_build_context *bld, 1662 LLVMValueRef x) 1663{ 1664 /* log(2) */ 1665 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529); 1666 1667 return lp_build_mul(bld, log2, lp_build_exp2(bld, x)); 1668} 1669 1670 1671#define EXP_POLY_DEGREE 3 1672#define LOG_POLY_DEGREE 5 1673 1674 1675/** 1676 * Generate polynomial. 1677 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 1678 */ 1679static LLVMValueRef 1680lp_build_polynomial(struct lp_build_context *bld, 1681 LLVMValueRef x, 1682 const double *coeffs, 1683 unsigned num_coeffs) 1684{ 1685 const struct lp_type type = bld->type; 1686 LLVMValueRef res = NULL; 1687 unsigned i; 1688 1689 /* TODO: optimize the constant case */ 1690 if(LLVMIsConstant(x)) 1691 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 1692 __FUNCTION__); 1693 1694 for (i = num_coeffs; i--; ) { 1695 LLVMValueRef coeff; 1696 1697 coeff = lp_build_const_vec(type, coeffs[i]); 1698 1699 if(res) 1700 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res)); 1701 else 1702 res = coeff; 1703 } 1704 1705 if(res) 1706 return res; 1707 else 1708 return bld->undef; 1709} 1710 1711 1712/** 1713 * Minimax polynomial fit of 2**x, in range [0, 1[ 1714 */ 1715const double lp_build_exp2_polynomial[] = { 1716#if EXP_POLY_DEGREE == 5 1717 0.999999999690134838155, 1718 0.583974334321735217258, 1719 0.164553105719676828492, 1720 0.0292811063701710962255, 1721 0.00354944426657875141846, 1722 0.000296253726543423377365 1723#elif EXP_POLY_DEGREE == 4 1724 1.00000001502262084505, 1725 0.563586057338685991394, 1726 0.150436017652442413623, 1727 
0.0243220604213317927308, 1728 0.0025359088446580436489 1729#elif EXP_POLY_DEGREE == 3 1730 0.999925218562710312959, 1731 0.695833540494823811697, 1732 0.226067155427249155588, 1733 0.0780245226406372992967 1734#elif EXP_POLY_DEGREE == 2 1735 1.00172476321474503578, 1736 0.657636275736077639316, 1737 0.33718943461968720704 1738#else 1739#error 1740#endif 1741}; 1742 1743 1744void 1745lp_build_exp2_approx(struct lp_build_context *bld, 1746 LLVMValueRef x, 1747 LLVMValueRef *p_exp2_int_part, 1748 LLVMValueRef *p_frac_part, 1749 LLVMValueRef *p_exp2) 1750{ 1751 const struct lp_type type = bld->type; 1752 LLVMTypeRef vec_type = lp_build_vec_type(type); 1753 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1754 LLVMValueRef ipart = NULL; 1755 LLVMValueRef fpart = NULL; 1756 LLVMValueRef expipart = NULL; 1757 LLVMValueRef expfpart = NULL; 1758 LLVMValueRef res = NULL; 1759 1760 if(p_exp2_int_part || p_frac_part || p_exp2) { 1761 /* TODO: optimize the constant case */ 1762 if(LLVMIsConstant(x)) 1763 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 1764 __FUNCTION__); 1765 1766 assert(type.floating && type.width == 32); 1767 1768 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0)); 1769 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999)); 1770 1771 /* ipart = floor(x) */ 1772 ipart = lp_build_floor(bld, x); 1773 1774 /* fpart = x - ipart */ 1775 fpart = LLVMBuildSub(bld->builder, x, ipart, ""); 1776 } 1777 1778 if(p_exp2_int_part || p_exp2) { 1779 /* expipart = (float) (1 << ipart) */ 1780 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, ""); 1781 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), ""); 1782 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), ""); 1783 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, ""); 1784 } 1785 1786 if(p_exp2) { 1787 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, 1788 
Elements(lp_build_exp2_polynomial)); 1789 1790 res = LLVMBuildMul(bld->builder, expipart, expfpart, ""); 1791 } 1792 1793 if(p_exp2_int_part) 1794 *p_exp2_int_part = expipart; 1795 1796 if(p_frac_part) 1797 *p_frac_part = fpart; 1798 1799 if(p_exp2) 1800 *p_exp2 = res; 1801} 1802 1803 1804LLVMValueRef 1805lp_build_exp2(struct lp_build_context *bld, 1806 LLVMValueRef x) 1807{ 1808 LLVMValueRef res; 1809 lp_build_exp2_approx(bld, x, NULL, NULL, &res); 1810 return res; 1811} 1812 1813 1814/** 1815 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 1816 * These coefficients can be generate with 1817 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html 1818 */ 1819const double lp_build_log2_polynomial[] = { 1820#if LOG_POLY_DEGREE == 6 1821 3.11578814719469302614, 1822 -3.32419399085241980044, 1823 2.59883907202499966007, 1824 -1.23152682416275988241, 1825 0.318212422185251071475, 1826 -0.0344359067839062357313 1827#elif LOG_POLY_DEGREE == 5 1828 2.8882704548164776201, 1829 -2.52074962577807006663, 1830 1.48116647521213171641, 1831 -0.465725644288844778798, 1832 0.0596515482674574969533 1833#elif LOG_POLY_DEGREE == 4 1834 2.61761038894603480148, 1835 -1.75647175389045657003, 1836 0.688243882994381274313, 1837 -0.107254423828329604454 1838#elif LOG_POLY_DEGREE == 3 1839 2.28330284476918490682, 1840 -1.04913055217340124191, 1841 0.204446009836232697516 1842#else 1843#error 1844#endif 1845}; 1846 1847 1848/** 1849 * See http://www.devmaster.net/forums/showthread.php?p=43580 1850 */ 1851void 1852lp_build_log2_approx(struct lp_build_context *bld, 1853 LLVMValueRef x, 1854 LLVMValueRef *p_exp, 1855 LLVMValueRef *p_floor_log2, 1856 LLVMValueRef *p_log2) 1857{ 1858 const struct lp_type type = bld->type; 1859 LLVMTypeRef vec_type = lp_build_vec_type(type); 1860 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1861 1862 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000); 1863 
LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff); 1864 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); 1865 1866 LLVMValueRef i = NULL; 1867 LLVMValueRef exp = NULL; 1868 LLVMValueRef mant = NULL; 1869 LLVMValueRef logexp = NULL; 1870 LLVMValueRef logmant = NULL; 1871 LLVMValueRef res = NULL; 1872 1873 if(p_exp || p_floor_log2 || p_log2) { 1874 /* TODO: optimize the constant case */ 1875 if(LLVMIsConstant(x)) 1876 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 1877 __FUNCTION__); 1878 1879 assert(type.floating && type.width == 32); 1880 1881 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, ""); 1882 1883 /* exp = (float) exponent(x) */ 1884 exp = LLVMBuildAnd(bld->builder, i, expmask, ""); 1885 } 1886 1887 if(p_floor_log2 || p_log2) { 1888 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), ""); 1889 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), ""); 1890 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, ""); 1891 } 1892 1893 if(p_log2) { 1894 /* mant = (float) mantissa(x) */ 1895 mant = LLVMBuildAnd(bld->builder, i, mantmask, ""); 1896 mant = LLVMBuildOr(bld->builder, mant, one, ""); 1897 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, ""); 1898 1899 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial, 1900 Elements(lp_build_log2_polynomial)); 1901 1902 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ 1903 logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), ""); 1904 1905 res = LLVMBuildAdd(bld->builder, logmant, logexp, ""); 1906 } 1907 1908 if(p_exp) { 1909 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, ""); 1910 *p_exp = exp; 1911 } 1912 1913 if(p_floor_log2) 1914 *p_floor_log2 = logexp; 1915 1916 if(p_log2) 1917 *p_log2 = res; 1918} 1919 1920 1921LLVMValueRef 1922lp_build_log2(struct lp_build_context *bld, 1923 LLVMValueRef x) 1924{ 1925 
LLVMValueRef res; 1926 lp_build_log2_approx(bld, x, NULL, NULL, &res); 1927 return res; 1928} 1929