lp_bld_arit.c revision 8a3a971743a90463e65b44f1769a5301a31ce4cd
1/************************************************************************** 2 * 3 * Copyright 2009 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 29/** 30 * @file 31 * Helper 32 * 33 * LLVM IR doesn't support all basic arithmetic operations we care about (most 34 * notably min/max and saturated operations), and it is often necessary to 35 * resort machine-specific intrinsics directly. The functions here hide all 36 * these implementation details from the other modules. 37 * 38 * We also do simple expressions simplification here. Reasons are: 39 * - it is very easy given we have all necessary information readily available 40 * - LLVM optimization passes fail to simplify several vector expressions 41 * - We often know value constraints which the optimization passes have no way 42 * of knowing, such as when source arguments are known to be in [0, 1] range. 43 * 44 * @author Jose Fonseca <jfonseca@vmware.com> 45 */ 46 47 48#include "util/u_memory.h" 49#include "util/u_debug.h" 50#include "util/u_math.h" 51#include "util/u_string.h" 52#include "util/u_cpu_detect.h" 53 54#include "lp_bld_type.h" 55#include "lp_bld_const.h" 56#include "lp_bld_intr.h" 57#include "lp_bld_logic.h" 58#include "lp_bld_pack.h" 59#include "lp_bld_arit.h" 60 61 62/** 63 * Generate min(a, b) 64 * No checks for special case values of a or b = 1 or 0 are done. 65 */ 66static LLVMValueRef 67lp_build_min_simple(struct lp_build_context *bld, 68 LLVMValueRef a, 69 LLVMValueRef b) 70{ 71 const struct lp_type type = bld->type; 72 const char *intrinsic = NULL; 73 LLVMValueRef cond; 74 75 /* TODO: optimize the constant case */ 76 77 if(type.width * type.length == 128) { 78 if(type.floating) { 79 if(type.width == 32 && util_cpu_caps.has_sse) 80 intrinsic = "llvm.x86.sse.min.ps"; 81 if(type.width == 64 && util_cpu_caps.has_sse2) 82 intrinsic = "llvm.x86.sse2.min.pd"; 83 } 84 else { 85 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) 86 intrinsic = "llvm.x86.sse2.pminu.b"; 87 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) 88 intrinsic = "llvm.x86.sse41.pminsb"; 89 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) 90 intrinsic = "llvm.x86.sse41.pminuw"; 91 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) 92 intrinsic = "llvm.x86.sse2.pmins.w"; 93 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) 94 intrinsic = "llvm.x86.sse41.pminud"; 95 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) 96 intrinsic = "llvm.x86.sse41.pminsd"; 97 } 98 } 99 100 if(intrinsic) 101 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b); 102 103 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 104 return lp_build_select(bld, cond, a, b); 105} 106 107 108/** 109 * Generate max(a, b) 110 * No checks for special case values of a or b = 1 or 0 are done. 111 */ 112static LLVMValueRef 113lp_build_max_simple(struct lp_build_context *bld, 114 LLVMValueRef a, 115 LLVMValueRef b) 116{ 117 const struct lp_type type = bld->type; 118 const char *intrinsic = NULL; 119 LLVMValueRef cond; 120 121 /* TODO: optimize the constant case */ 122 123 if(type.width * type.length == 128) { 124 if(type.floating) { 125 if(type.width == 32 && util_cpu_caps.has_sse) 126 intrinsic = "llvm.x86.sse.max.ps"; 127 if(type.width == 64 && util_cpu_caps.has_sse2) 128 intrinsic = "llvm.x86.sse2.max.pd"; 129 } 130 else { 131 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) 132 intrinsic = "llvm.x86.sse2.pmaxu.b"; 133 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) 134 intrinsic = "llvm.x86.sse41.pmaxsb"; 135 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) 136 intrinsic = "llvm.x86.sse41.pmaxuw"; 137 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) 138 intrinsic = "llvm.x86.sse2.pmaxs.w"; 139 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) 140 intrinsic = "llvm.x86.sse41.pmaxud"; 141 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) 142 intrinsic = "llvm.x86.sse41.pmaxsd"; 143 } 144 } 145 146 if(intrinsic) 147 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b); 148 149 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 150 return lp_build_select(bld, cond, a, b); 151} 152 153 154/** 155 * Generate 1 - a, or ~a depending on bld->type. 156 */ 157LLVMValueRef 158lp_build_comp(struct lp_build_context *bld, 159 LLVMValueRef a) 160{ 161 const struct lp_type type = bld->type; 162 163 if(a == bld->one) 164 return bld->zero; 165 if(a == bld->zero) 166 return bld->one; 167 168 if(type.norm && !type.floating && !type.fixed && !type.sign) { 169 if(LLVMIsConstant(a)) 170 return LLVMConstNot(a); 171 else 172 return LLVMBuildNot(bld->builder, a, ""); 173 } 174 175 if(LLVMIsConstant(a)) 176 if (type.floating) 177 return LLVMConstFSub(bld->one, a); 178 else 179 return LLVMConstSub(bld->one, a); 180 else 181 if (type.floating) 182 return LLVMBuildFSub(bld->builder, bld->one, a, ""); 183 else 184 return LLVMBuildSub(bld->builder, bld->one, a, ""); 185} 186 187 188/** 189 * Generate a + b 190 */ 191LLVMValueRef 192lp_build_add(struct lp_build_context *bld, 193 LLVMValueRef a, 194 LLVMValueRef b) 195{ 196 const struct lp_type type = bld->type; 197 LLVMValueRef res; 198 199 assert(lp_check_value(type, a)); 200 assert(lp_check_value(type, b)); 201 202 if(a == bld->zero) 203 return b; 204 if(b == bld->zero) 205 return a; 206 if(a == bld->undef || b == bld->undef) 207 return bld->undef; 208 209 if(bld->type.norm) { 210 const char *intrinsic = NULL; 211 212 if(a == bld->one || b == bld->one) 213 return bld->one; 214 215 if(util_cpu_caps.has_sse2 && 216 type.width * type.length == 128 && 217 !type.floating && !type.fixed) { 218 if(type.width == 8) 219 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; 220 if(type.width == 16) 221 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; 222 } 223 224 if(intrinsic) 225 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b); 226 } 227 228 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 229 if (type.floating) 230 res = LLVMConstFAdd(a, b); 231 else 232 res = LLVMConstAdd(a, b); 233 else 234 if (type.floating) 235 res = LLVMBuildFAdd(bld->builder, a, b, ""); 236 else 237 res = LLVMBuildAdd(bld->builder, a, b, ""); 238 239 /* clamp to ceiling of 1.0 */ 240 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 241 res = lp_build_min_simple(bld, res, bld->one); 242 243 /* XXX clamp to floor of -1 or 0??? */ 244 245 return res; 246} 247 248 249/** Return the sum of the elements of a */ 250LLVMValueRef 251lp_build_sum_vector(struct lp_build_context *bld, 252 LLVMValueRef a) 253{ 254 const struct lp_type type = bld->type; 255 LLVMValueRef index, res; 256 unsigned i; 257 258 if (a == bld->zero) 259 return bld->zero; 260 if (a == bld->undef) 261 return bld->undef; 262 assert(type.length > 1); 263 264 assert(!bld->type.norm); 265 266 index = LLVMConstInt(LLVMInt32Type(), 0, 0); 267 res = LLVMBuildExtractElement(bld->builder, a, index, ""); 268 269 for (i = 1; i < type.length; i++) { 270 index = LLVMConstInt(LLVMInt32Type(), i, 0); 271 if (type.floating) 272 res = LLVMBuildFAdd(bld->builder, res, 273 LLVMBuildExtractElement(bld->builder, 274 a, index, ""), 275 ""); 276 else 277 res = LLVMBuildAdd(bld->builder, res, 278 LLVMBuildExtractElement(bld->builder, 279 a, index, ""), 280 ""); 281 } 282 283 return res; 284} 285 286 287/** 288 * Generate a - b 289 */ 290LLVMValueRef 291lp_build_sub(struct lp_build_context *bld, 292 LLVMValueRef a, 293 LLVMValueRef b) 294{ 295 const struct lp_type type = bld->type; 296 LLVMValueRef res; 297 298 assert(lp_check_value(type, a)); 299 assert(lp_check_value(type, b)); 300 301 if(b == bld->zero) 302 return a; 303 if(a == bld->undef || b == bld->undef) 304 return bld->undef; 305 if(a == b) 306 return bld->zero; 307 308 if(bld->type.norm) { 309 const char *intrinsic = NULL; 310 311 if(b == bld->one) 312 return bld->zero; 313 314 if(util_cpu_caps.has_sse2 && 315 type.width * type.length == 128 && 316 !type.floating && !type.fixed) { 317 if(type.width == 8) 318 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; 319 if(type.width == 16) 320 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; 321 } 322 323 if(intrinsic) 324 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b); 325 } 326 327 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 328 if (type.floating) 329 res = LLVMConstFSub(a, b); 330 else 331 res = LLVMConstSub(a, b); 332 else 333 if (type.floating) 334 res = LLVMBuildFSub(bld->builder, a, b, ""); 335 else 336 res = LLVMBuildSub(bld->builder, a, b, ""); 337 338 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 339 res = lp_build_max_simple(bld, res, bld->zero); 340 341 return res; 342} 343 344 345/** 346 * Normalized 8bit multiplication. 347 * 348 * - alpha plus one 349 * 350 * makes the following approximation to the division (Sree) 351 * 352 * a*b/255 ~= (a*(b + 1)) >> 256 353 * 354 * which is the fastest method that satisfies the following OpenGL criteria 355 * 356 * 0*0 = 0 and 255*255 = 255 357 * 358 * - geometric series 359 * 360 * takes the geometric series approximation to the division 361 * 362 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) .. 363 * 364 * in this case just the first two terms to fit in 16bit arithmetic 365 * 366 * t/255 ~= (t + (t >> 8)) >> 8 367 * 368 * note that just by itself it doesn't satisfies the OpenGL criteria, as 369 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff 370 * must be used 371 * 372 * - geometric series plus rounding 373 * 374 * when using a geometric series division instead of truncating the result 375 * use roundoff in the approximation (Jim Blinn) 376 * 377 * t/255 ~= (t + (t >> 8) + 0x80) >> 8 378 * 379 * achieving the exact results 380 * 381 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 382 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf 383 * @sa Michael Herf, The "double blend trick", May 2000, 384 * http://www.stereopsis.com/doubleblend.html 385 */ 386static LLVMValueRef 387lp_build_mul_u8n(LLVMBuilderRef builder, 388 struct lp_type i16_type, 389 LLVMValueRef a, LLVMValueRef b) 390{ 391 LLVMValueRef c8; 392 LLVMValueRef ab; 393 394 c8 = lp_build_const_int_vec(i16_type, 8); 395 396#if 0 397 398 /* a*b/255 ~= (a*(b + 1)) >> 256 */ 399 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), ""); 400 ab = LLVMBuildMul(builder, a, b, ""); 401 402#else 403 404 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */ 405 ab = LLVMBuildMul(builder, a, b, ""); 406 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), ""); 407 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), ""); 408 409#endif 410 411 ab = LLVMBuildLShr(builder, ab, c8, ""); 412 413 return ab; 414} 415 416 417/** 418 * Generate a * b 419 */ 420LLVMValueRef 421lp_build_mul(struct lp_build_context *bld, 422 LLVMValueRef a, 423 LLVMValueRef b) 424{ 425 const struct lp_type type = bld->type; 426 LLVMValueRef shift; 427 LLVMValueRef res; 428 429 assert(lp_check_value(type, a)); 430 assert(lp_check_value(type, b)); 431 432 if(a == bld->zero) 433 return bld->zero; 434 if(a == bld->one) 435 return b; 436 if(b == bld->zero) 437 return bld->zero; 438 if(b == bld->one) 439 return a; 440 if(a == bld->undef || b == bld->undef) 441 return bld->undef; 442 443 if(!type.floating && !type.fixed && type.norm) { 444 if(type.width == 8) { 445 struct lp_type i16_type = lp_wider_type(type); 446 LLVMValueRef al, ah, bl, bh, abl, abh, ab; 447 448 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah); 449 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh); 450 451 /* PMULLW, PSRLW, PADDW */ 452 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl); 453 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh); 454 455 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh); 456 457 return ab; 458 } 459 460 /* FIXME */ 461 assert(0); 462 } 463 464 if(type.fixed) 465 shift = lp_build_const_int_vec(type, type.width/2); 466 else 467 shift = NULL; 468 469 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 470 if (type.floating) 471 res = LLVMConstFMul(a, b); 472 else 473 res = LLVMConstMul(a, b); 474 if(shift) { 475 if(type.sign) 476 res = LLVMConstAShr(res, shift); 477 else 478 res = LLVMConstLShr(res, shift); 479 } 480 } 481 else { 482 if (type.floating) 483 res = LLVMBuildFMul(bld->builder, a, b, ""); 484 else 485 res = LLVMBuildMul(bld->builder, a, b, ""); 486 if(shift) { 487 if(type.sign) 488 res = LLVMBuildAShr(bld->builder, res, shift, ""); 489 else 490 res = LLVMBuildLShr(bld->builder, res, shift, ""); 491 } 492 } 493 494 return res; 495} 496 497 498/** 499 * Small vector x scale multiplication optimization. 500 */ 501LLVMValueRef 502lp_build_mul_imm(struct lp_build_context *bld, 503 LLVMValueRef a, 504 int b) 505{ 506 LLVMValueRef factor; 507 508 if(b == 0) 509 return bld->zero; 510 511 if(b == 1) 512 return a; 513 514 if(b == -1) 515 return lp_build_negate(bld, a); 516 517 if(b == 2 && bld->type.floating) 518 return lp_build_add(bld, a, a); 519 520 if(util_is_pot(b)) { 521 unsigned shift = ffs(b) - 1; 522 523 if(bld->type.floating) { 524#if 0 525 /* 526 * Power of two multiplication by directly manipulating the mantissa. 527 * 528 * XXX: This might not be always faster, it will introduce a small error 529 * for multiplication by zero, and it will produce wrong results 530 * for Inf and NaN. 531 */ 532 unsigned mantissa = lp_mantissa(bld->type); 533 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa); 534 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), ""); 535 a = LLVMBuildAdd(bld->builder, a, factor, ""); 536 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), ""); 537 return a; 538#endif 539 } 540 else { 541 factor = lp_build_const_vec(bld->type, shift); 542 return LLVMBuildShl(bld->builder, a, factor, ""); 543 } 544 } 545 546 factor = lp_build_const_vec(bld->type, (double)b); 547 return lp_build_mul(bld, a, factor); 548} 549 550 551/** 552 * Generate a / b 553 */ 554LLVMValueRef 555lp_build_div(struct lp_build_context *bld, 556 LLVMValueRef a, 557 LLVMValueRef b) 558{ 559 const struct lp_type type = bld->type; 560 561 assert(lp_check_value(type, a)); 562 assert(lp_check_value(type, b)); 563 564 if(a == bld->zero) 565 return bld->zero; 566 if(a == bld->one) 567 return lp_build_rcp(bld, b); 568 if(b == bld->zero) 569 return bld->undef; 570 if(b == bld->one) 571 return a; 572 if(a == bld->undef || b == bld->undef) 573 return bld->undef; 574 575 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 576 return LLVMConstFDiv(a, b); 577 578 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) 579 return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 580 581 return LLVMBuildFDiv(bld->builder, a, b, ""); 582} 583 584 585/** 586 * Linear interpolation. 587 * 588 * This also works for integer values with a few caveats. 589 * 590 * @sa http://www.stereopsis.com/doubleblend.html 591 */ 592LLVMValueRef 593lp_build_lerp(struct lp_build_context *bld, 594 LLVMValueRef x, 595 LLVMValueRef v0, 596 LLVMValueRef v1) 597{ 598 LLVMValueRef delta; 599 LLVMValueRef res; 600 601 delta = lp_build_sub(bld, v1, v0); 602 603 res = lp_build_mul(bld, x, delta); 604 605 res = lp_build_add(bld, v0, res); 606 607 if(bld->type.fixed) 608 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits, 609 * but it will be wrong for other uses. Basically we need a more 610 * powerful lp_type, capable of further distinguishing the values 611 * interpretation from the value storage. */ 612 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), ""); 613 614 return res; 615} 616 617 618LLVMValueRef 619lp_build_lerp_2d(struct lp_build_context *bld, 620 LLVMValueRef x, 621 LLVMValueRef y, 622 LLVMValueRef v00, 623 LLVMValueRef v01, 624 LLVMValueRef v10, 625 LLVMValueRef v11) 626{ 627 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01); 628 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11); 629 return lp_build_lerp(bld, y, v0, v1); 630} 631 632 633/** 634 * Generate min(a, b) 635 * Do checks for special cases. 636 */ 637LLVMValueRef 638lp_build_min(struct lp_build_context *bld, 639 LLVMValueRef a, 640 LLVMValueRef b) 641{ 642 if(a == bld->undef || b == bld->undef) 643 return bld->undef; 644 645 if(a == b) 646 return a; 647 648 if(bld->type.norm) { 649 if(a == bld->zero || b == bld->zero) 650 return bld->zero; 651 if(a == bld->one) 652 return b; 653 if(b == bld->one) 654 return a; 655 } 656 657 return lp_build_min_simple(bld, a, b); 658} 659 660 661/** 662 * Generate max(a, b) 663 * Do checks for special cases. 664 */ 665LLVMValueRef 666lp_build_max(struct lp_build_context *bld, 667 LLVMValueRef a, 668 LLVMValueRef b) 669{ 670 if(a == bld->undef || b == bld->undef) 671 return bld->undef; 672 673 if(a == b) 674 return a; 675 676 if(bld->type.norm) { 677 if(a == bld->one || b == bld->one) 678 return bld->one; 679 if(a == bld->zero) 680 return b; 681 if(b == bld->zero) 682 return a; 683 } 684 685 return lp_build_max_simple(bld, a, b); 686} 687 688 689/** 690 * Generate clamp(a, min, max) 691 * Do checks for special cases. 692 */ 693LLVMValueRef 694lp_build_clamp(struct lp_build_context *bld, 695 LLVMValueRef a, 696 LLVMValueRef min, 697 LLVMValueRef max) 698{ 699 a = lp_build_min(bld, a, max); 700 a = lp_build_max(bld, a, min); 701 return a; 702} 703 704 705/** 706 * Generate abs(a) 707 */ 708LLVMValueRef 709lp_build_abs(struct lp_build_context *bld, 710 LLVMValueRef a) 711{ 712 const struct lp_type type = bld->type; 713 LLVMTypeRef vec_type = lp_build_vec_type(type); 714 715 if(!type.sign) 716 return a; 717 718 if(type.floating) { 719 /* Mask out the sign bit */ 720 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 721 unsigned long long absMask = ~(1ULL << (type.width - 1)); 722 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask)); 723 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); 724 a = LLVMBuildAnd(bld->builder, a, mask, ""); 725 a = LLVMBuildBitCast(bld->builder, a, vec_type, ""); 726 return a; 727 } 728 729 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) { 730 switch(type.width) { 731 case 8: 732 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 733 case 16: 734 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 735 case 32: 736 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 737 } 738 } 739 740 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, "")); 741} 742 743 744LLVMValueRef 745lp_build_negate(struct lp_build_context *bld, 746 LLVMValueRef a) 747{ 748#if HAVE_LLVM >= 0x0207 749 if (bld->type.floating) 750 a = LLVMBuildFNeg(bld->builder, a, ""); 751 else 752#endif 753 a = LLVMBuildNeg(bld->builder, a, ""); 754 755 return a; 756} 757 758 759/** Return -1, 0 or +1 depending on the sign of a */ 760LLVMValueRef 761lp_build_sgn(struct lp_build_context *bld, 762 LLVMValueRef a) 763{ 764 const struct lp_type type = bld->type; 765 LLVMValueRef cond; 766 LLVMValueRef res; 767 768 /* Handle non-zero case */ 769 if(!type.sign) { 770 /* if not zero then sign must be positive */ 771 res = bld->one; 772 } 773 else if(type.floating) { 774 LLVMTypeRef vec_type; 775 LLVMTypeRef int_type; 776 LLVMValueRef mask; 777 LLVMValueRef sign; 778 LLVMValueRef one; 779 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); 780 781 int_type = lp_build_int_vec_type(type); 782 vec_type = lp_build_vec_type(type); 783 mask = lp_build_const_int_vec(type, maskBit); 784 785 /* Take the sign bit and add it to 1 constant */ 786 sign = LLVMBuildBitCast(bld->builder, a, int_type, ""); 787 sign = LLVMBuildAnd(bld->builder, sign, mask, ""); 788 one = LLVMConstBitCast(bld->one, int_type); 789 res = LLVMBuildOr(bld->builder, sign, one, ""); 790 res = LLVMBuildBitCast(bld->builder, res, vec_type, ""); 791 } 792 else 793 { 794 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0); 795 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); 796 res = lp_build_select(bld, cond, bld->one, minus_one); 797 } 798 799 /* Handle zero */ 800 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero); 801 res = lp_build_select(bld, cond, bld->zero, res); 802 803 return res; 804} 805 806 807/** 808 * Set the sign of float vector 'a' according to 'sign'. 809 * If sign==0, return abs(a). 810 * If sign==1, return -abs(a); 811 * Other values for sign produce undefined results. 812 */ 813LLVMValueRef 814lp_build_set_sign(struct lp_build_context *bld, 815 LLVMValueRef a, LLVMValueRef sign) 816{ 817 const struct lp_type type = bld->type; 818 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 819 LLVMTypeRef vec_type = lp_build_vec_type(type); 820 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1); 821 LLVMValueRef mask = lp_build_const_int_vec(type, 822 ~((unsigned long long) 1 << (type.width - 1))); 823 LLVMValueRef val, res; 824 825 assert(type.floating); 826 827 /* val = reinterpret_cast<int>(a) */ 828 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); 829 /* val = val & mask */ 830 val = LLVMBuildAnd(bld->builder, val, mask, ""); 831 /* sign = sign << shift */ 832 sign = LLVMBuildShl(bld->builder, sign, shift, ""); 833 /* res = val | sign */ 834 res = LLVMBuildOr(bld->builder, val, sign, ""); 835 /* res = reinterpret_cast<float>(res) */ 836 res = LLVMBuildBitCast(bld->builder, res, vec_type, ""); 837 838 return res; 839} 840 841 842/** 843 * Convert vector of (or scalar) int to vector of (or scalar) float. 844 */ 845LLVMValueRef 846lp_build_int_to_float(struct lp_build_context *bld, 847 LLVMValueRef a) 848{ 849 const struct lp_type type = bld->type; 850 LLVMTypeRef vec_type = lp_build_vec_type(type); 851 852 assert(type.floating); 853 854 return LLVMBuildSIToFP(bld->builder, a, vec_type, ""); 855} 856 857 858 859enum lp_build_round_sse41_mode 860{ 861 LP_BUILD_ROUND_SSE41_NEAREST = 0, 862 LP_BUILD_ROUND_SSE41_FLOOR = 1, 863 LP_BUILD_ROUND_SSE41_CEIL = 2, 864 LP_BUILD_ROUND_SSE41_TRUNCATE = 3 865}; 866 867 868static INLINE LLVMValueRef 869lp_build_round_sse41(struct lp_build_context *bld, 870 LLVMValueRef a, 871 enum lp_build_round_sse41_mode mode) 872{ 873 const struct lp_type type = bld->type; 874 LLVMTypeRef vec_type = lp_build_vec_type(type); 875 const char *intrinsic; 876 877 assert(type.floating); 878 assert(type.width*type.length == 128); 879 assert(lp_check_value(type, a)); 880 assert(util_cpu_caps.has_sse4_1); 881 882 switch(type.width) { 883 case 32: 884 intrinsic = "llvm.x86.sse41.round.ps"; 885 break; 886 case 64: 887 intrinsic = "llvm.x86.sse41.round.pd"; 888 break; 889 default: 890 assert(0); 891 return bld->undef; 892 } 893 894 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a, 895 LLVMConstInt(LLVMInt32Type(), mode, 0)); 896} 897 898 899/** 900 * Return the integer part of a float (vector) value. The returned value is 901 * a float (vector). 902 * Ex: trunc(-1.5) = 1.0 903 */ 904LLVMValueRef 905lp_build_trunc(struct lp_build_context *bld, 906 LLVMValueRef a) 907{ 908 const struct lp_type type = bld->type; 909 910 assert(type.floating); 911 assert(lp_check_value(type, a)); 912 913 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) 914 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); 915 else { 916 LLVMTypeRef vec_type = lp_build_vec_type(type); 917 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 918 LLVMValueRef res; 919 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, ""); 920 res = LLVMBuildSIToFP(bld->builder, res, vec_type, ""); 921 return res; 922 } 923} 924 925 926/** 927 * Return float (vector) rounded to nearest integer (vector). The returned 928 * value is a float (vector). 929 * Ex: round(0.9) = 1.0 930 * Ex: round(-1.5) = -2.0 931 */ 932LLVMValueRef 933lp_build_round(struct lp_build_context *bld, 934 LLVMValueRef a) 935{ 936 const struct lp_type type = bld->type; 937 938 assert(type.floating); 939 assert(lp_check_value(type, a)); 940 941 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) 942 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 943 else { 944 LLVMTypeRef vec_type = lp_build_vec_type(type); 945 LLVMValueRef res; 946 res = lp_build_iround(bld, a); 947 res = LLVMBuildSIToFP(bld->builder, res, vec_type, ""); 948 return res; 949 } 950} 951 952 953/** 954 * Return floor of float (vector), result is a float (vector) 955 * Ex: floor(1.1) = 1.0 956 * Ex: floor(-1.1) = -2.0 957 */ 958LLVMValueRef 959lp_build_floor(struct lp_build_context *bld, 960 LLVMValueRef a) 961{ 962 const struct lp_type type = bld->type; 963 964 assert(type.floating); 965 assert(lp_check_value(type, a)); 966 967 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) 968 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 969 else { 970 LLVMTypeRef vec_type = lp_build_vec_type(type); 971 LLVMValueRef res; 972 res = lp_build_ifloor(bld, a); 973 res = LLVMBuildSIToFP(bld->builder, res, vec_type, ""); 974 return res; 975 } 976} 977 978 979/** 980 * Return ceiling of float (vector), returning float (vector). 981 * Ex: ceil( 1.1) = 2.0 982 * Ex: ceil(-1.1) = -1.0 983 */ 984LLVMValueRef 985lp_build_ceil(struct lp_build_context *bld, 986 LLVMValueRef a) 987{ 988 const struct lp_type type = bld->type; 989 990 assert(type.floating); 991 assert(lp_check_value(type, a)); 992 993 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) 994 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 995 else { 996 LLVMTypeRef vec_type = lp_build_vec_type(type); 997 LLVMValueRef res; 998 res = lp_build_iceil(bld, a); 999 res = LLVMBuildSIToFP(bld->builder, res, vec_type, ""); 1000 return res; 1001 } 1002} 1003 1004 1005/** 1006 * Return fractional part of 'a' computed as a - floor(a) 1007 * Typically used in texture coord arithmetic. 1008 */ 1009LLVMValueRef 1010lp_build_fract(struct lp_build_context *bld, 1011 LLVMValueRef a) 1012{ 1013 assert(bld->type.floating); 1014 return lp_build_sub(bld, a, lp_build_floor(bld, a)); 1015} 1016 1017 1018/** 1019 * Return the integer part of a float (vector) value. The returned value is 1020 * an integer (vector). 1021 * Ex: itrunc(-1.5) = 1 1022 */ 1023LLVMValueRef 1024lp_build_itrunc(struct lp_build_context *bld, 1025 LLVMValueRef a) 1026{ 1027 const struct lp_type type = bld->type; 1028 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1029 1030 assert(type.floating); 1031 assert(lp_check_value(type, a)); 1032 1033 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, ""); 1034} 1035 1036 1037/** 1038 * Return float (vector) rounded to nearest integer (vector). The returned 1039 * value is an integer (vector). 1040 * Ex: iround(0.9) = 1 1041 * Ex: iround(-1.5) = -2 1042 */ 1043LLVMValueRef 1044lp_build_iround(struct lp_build_context *bld, 1045 LLVMValueRef a) 1046{ 1047 const struct lp_type type = bld->type; 1048 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1049 LLVMValueRef res; 1050 1051 assert(type.floating); 1052 1053 assert(lp_check_value(type, a)); 1054 1055 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { 1056 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 1057 } 1058 else { 1059 LLVMTypeRef vec_type = lp_build_vec_type(type); 1060 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); 1061 LLVMValueRef sign; 1062 LLVMValueRef half; 1063 1064 /* get sign bit */ 1065 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); 1066 sign = LLVMBuildAnd(bld->builder, sign, mask, ""); 1067 1068 /* sign * 0.5 */ 1069 half = lp_build_const_vec(type, 0.5); 1070 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); 1071 half = LLVMBuildOr(bld->builder, sign, half, ""); 1072 half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); 1073 1074 res = LLVMBuildFAdd(bld->builder, a, half, ""); 1075 } 1076 1077 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, ""); 1078 1079 return res; 1080} 1081 1082 1083/** 1084 * Return floor of float (vector), result is an int (vector) 1085 * Ex: ifloor(1.1) = 1.0 1086 * Ex: ifloor(-1.1) = -2.0 1087 */ 1088LLVMValueRef 1089lp_build_ifloor(struct lp_build_context *bld, 1090 LLVMValueRef a) 1091{ 1092 const struct lp_type type = bld->type; 1093 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1094 LLVMValueRef res; 1095 1096 assert(type.floating); 1097 assert(lp_check_value(type, a)); 1098 1099 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { 1100 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 1101 } 1102 else { 1103 /* Take the sign bit and add it to 1 constant */ 1104 LLVMTypeRef vec_type = lp_build_vec_type(type); 1105 unsigned mantissa = lp_mantissa(type); 1106 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); 1107 LLVMValueRef sign; 1108 LLVMValueRef offset; 1109 1110 /* sign = a < 0 ? ~0 : 0 */ 1111 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); 1112 sign = LLVMBuildAnd(bld->builder, sign, mask, ""); 1113 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign"); 1114 1115 /* offset = -0.99999(9)f */ 1116 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); 1117 offset = LLVMConstBitCast(offset, int_vec_type); 1118 1119 /* offset = a < 0 ? offset : 0.0f */ 1120 offset = LLVMBuildAnd(bld->builder, offset, sign, ""); 1121 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); 1122 1123 res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res"); 1124 } 1125 1126 /* round to nearest (toward zero) */ 1127 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res"); 1128 1129 return res; 1130} 1131 1132 1133/** 1134 * Return ceiling of float (vector), returning int (vector). 1135 * Ex: iceil( 1.1) = 2 1136 * Ex: iceil(-1.1) = -1 1137 */ 1138LLVMValueRef 1139lp_build_iceil(struct lp_build_context *bld, 1140 LLVMValueRef a) 1141{ 1142 const struct lp_type type = bld->type; 1143 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1144 LLVMValueRef res; 1145 1146 assert(type.floating); 1147 assert(lp_check_value(type, a)); 1148 1149 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { 1150 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 1151 } 1152 else { 1153 LLVMTypeRef vec_type = lp_build_vec_type(type); 1154 unsigned mantissa = lp_mantissa(type); 1155 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); 1156 LLVMValueRef sign; 1157 LLVMValueRef offset; 1158 1159 /* sign = a < 0 ? 0 : ~0 */ 1160 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); 1161 sign = LLVMBuildAnd(bld->builder, sign, mask, ""); 1162 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign"); 1163 sign = LLVMBuildNot(bld->builder, sign, "iceil.not"); 1164 1165 /* offset = 0.99999(9)f */ 1166 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); 1167 offset = LLVMConstBitCast(offset, int_vec_type); 1168 1169 /* offset = a < 0 ? 0.0 : offset */ 1170 offset = LLVMBuildAnd(bld->builder, offset, sign, ""); 1171 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); 1172 1173 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res"); 1174 } 1175 1176 /* round to nearest (toward zero) */ 1177 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res"); 1178 1179 return res; 1180} 1181 1182 1183LLVMValueRef 1184lp_build_sqrt(struct lp_build_context *bld, 1185 LLVMValueRef a) 1186{ 1187 const struct lp_type type = bld->type; 1188 LLVMTypeRef vec_type = lp_build_vec_type(type); 1189 char intrinsic[32]; 1190 1191 /* TODO: optimize the constant case */ 1192 /* TODO: optimize the constant case */ 1193 1194 assert(type.floating); 1195 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width); 1196 1197 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a); 1198} 1199 1200 1201LLVMValueRef 1202lp_build_rcp(struct lp_build_context *bld, 1203 LLVMValueRef a) 1204{ 1205 const struct lp_type type = bld->type; 1206 1207 if(a == bld->zero) 1208 return bld->undef; 1209 if(a == bld->one) 1210 return bld->one; 1211 if(a == bld->undef) 1212 return bld->undef; 1213 1214 assert(type.floating); 1215 1216 if(LLVMIsConstant(a)) 1217 return LLVMConstFDiv(bld->one, a); 1218 1219 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { 1220 /* 1221 * XXX: Added precision is not always necessary, so only enable this 1222 * when we have a better system in place to track minimum precision. 1223 */ 1224 1225#if 0 1226 /* 1227 * Do one Newton-Raphson step to improve precision: 1228 * 1229 * x1 = (2 - a * rcp(a)) * rcp(a) 1230 */ 1231 1232 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0); 1233 LLVMValueRef rcp_a; 1234 LLVMValueRef res; 1235 1236 rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a); 1237 1238 res = LLVMBuildFMul(bld->builder, a, rcp_a, ""); 1239 res = LLVMBuildFSub(bld->builder, two, res, ""); 1240 res = LLVMBuildFMul(bld->builder, res, rcp_a, ""); 1241 1242 return rcp_a; 1243#else 1244 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a); 1245#endif 1246 } 1247 1248 return LLVMBuildFDiv(bld->builder, bld->one, a, ""); 1249} 1250 1251 1252/** 1253 * Generate 1/sqrt(a) 1254 */ 1255LLVMValueRef 1256lp_build_rsqrt(struct lp_build_context *bld, 1257 LLVMValueRef a) 1258{ 1259 const struct lp_type type = bld->type; 1260 1261 assert(type.floating); 1262 1263 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) 1264 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a); 1265 1266 return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 1267} 1268 1269 1270static inline LLVMValueRef 1271lp_build_const_v4si(unsigned long value) 1272{ 1273 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0); 1274 LLVMValueRef elements[4] = { element, element, element, element }; 1275 return LLVMConstVector(elements, 4); 1276} 1277 1278static inline LLVMValueRef 1279lp_build_const_v4sf(float value) 1280{ 1281 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value); 1282 LLVMValueRef elements[4] = { element, element, element, element }; 1283 return LLVMConstVector(elements, 4); 1284} 1285 1286 1287/** 1288 * Generate sin(a) using SSE2 1289 */ 1290LLVMValueRef 1291lp_build_sin(struct lp_build_context *bld, 1292 LLVMValueRef a) 1293{ 1294 struct lp_type int_type = lp_int_type(bld->type); 1295 LLVMBuilderRef b = bld->builder; 1296 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4); 1297 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4); 1298 1299 /* 1300 * take the absolute value, 1301 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 1302 */ 1303 1304 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000); 1305 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si"); 1306 1307 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 1308 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs"); 1309 1310 /* 1311 * extract the sign bit (upper one) 1312 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask); 1313 */ 1314 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000); 1315 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i"); 1316 1317 /* 1318 * scale by 4/Pi 1319 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 1320 */ 1321 1322 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516); 1323 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 1324 1325 /* 1326 * store the integer part of y in mm0 1327 * emm2 = _mm_cvttps_epi32(y); 1328 */ 1329 1330 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i"); 1331 1332 /* 1333 * j=(j+1) & (~1) (see the cephes sources) 1334 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 1335 */ 1336 1337 LLVMValueRef all_one = lp_build_const_v4si(1); 1338 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 1339 /* 1340 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 1341 */ 1342 LLVMValueRef inv_one = lp_build_const_v4si(~1); 1343 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 1344 1345 /* 1346 * y = _mm_cvtepi32_ps(emm2); 1347 */ 1348 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2"); 1349 1350 /* get the swap sign flag 1351 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4); 1352 */ 1353 LLVMValueRef pi32_4 = lp_build_const_v4si(4); 1354 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and"); 1355 1356 /* 1357 * emm2 = _mm_slli_epi32(emm0, 29); 1358 */ 1359 LLVMValueRef const_29 = lp_build_const_v4si(29); 1360 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit"); 1361 1362 /* 1363 * get the polynom selection mask 1364 * there is one polynom for 0 <= x <= Pi/4 1365 * and another one for Pi/4<x<=Pi/2 1366 * Both branches will be computed. 1367 * 1368 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 1369 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 1370 */ 1371 1372 LLVMValueRef pi32_2 = lp_build_const_v4si(2); 1373 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3"); 1374 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL, 1375 emm2_3, lp_build_const_v4si(0)); 1376 /* 1377 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit); 1378 */ 1379 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit"); 1380 1381 /* 1382 * _PS_CONST(minus_cephes_DP1, -0.78515625); 1383 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 1384 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 1385 */ 1386 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625); 1387 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4); 1388 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8); 1389 1390 /* 1391 * The magic pass: "Extended precision modular arithmetic" 1392 * x = ((x - y * DP1) - y * DP2) - y * DP3; 1393 * xmm1 = _mm_mul_ps(y, xmm1); 1394 * xmm2 = _mm_mul_ps(y, xmm2); 1395 * xmm3 = _mm_mul_ps(y, xmm3); 1396 */ 1397 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); 1398 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); 1399 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); 1400 1401 /* 1402 * x = _mm_add_ps(x, xmm1); 1403 * x = _mm_add_ps(x, xmm2); 1404 * x = _mm_add_ps(x, xmm3); 1405 */ 1406 1407 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); 1408 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2"); 1409 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); 1410 1411 /* 1412 * Evaluate the first polynom (0 <= x <= Pi/4) 1413 * 1414 * z = _mm_mul_ps(x,x); 1415 */ 1416 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 1417 1418 /* 1419 * _PS_CONST(coscof_p0, 2.443315711809948E-005); 1420 * _PS_CONST(coscof_p1, -1.388731625493765E-003); 1421 * _PS_CONST(coscof_p2, 4.166664568298827E-002); 1422 */ 1423 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005); 1424 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003); 1425 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002); 1426 1427 /* 1428 * y = *(v4sf*)_ps_coscof_p0; 1429 * y = _mm_mul_ps(y, z); 1430 */ 1431 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); 1432 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4"); 1433 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); 1434 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); 1435 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 1436 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 1437 1438 1439 /* 1440 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 1441 * y = _mm_sub_ps(y, tmp); 1442 * y = _mm_add_ps(y, *(v4sf*)_ps_1); 1443 */ 1444 LLVMValueRef half = lp_build_const_v4sf(0.5); 1445 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); 1446 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 1447 LLVMValueRef one = lp_build_const_v4sf(1.0); 1448 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 1449 1450 /* 1451 * _PS_CONST(sincof_p0, -1.9515295891E-4); 1452 * _PS_CONST(sincof_p1, 8.3321608736E-3); 1453 * _PS_CONST(sincof_p2, -1.6666654611E-1); 1454 */ 1455 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4); 1456 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3); 1457 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1); 1458 1459 /* 1460 * Evaluate the second polynom (Pi/4 <= x <= 0) 1461 * 1462 * y2 = *(v4sf*)_ps_sincof_p0; 1463 * y2 = _mm_mul_ps(y2, z); 1464 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 1465 * y2 = _mm_mul_ps(y2, z); 1466 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 1467 * y2 = _mm_mul_ps(y2, z); 1468 * y2 = _mm_mul_ps(y2, x); 1469 * y2 = _mm_add_ps(y2, x); 1470 */ 1471 1472 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); 1473 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); 1474 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); 1475 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); 1476 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 1477 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); 1478 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); 1479 1480 /* 1481 * select the correct result from the two polynoms 1482 * xmm3 = poly_mask; 1483 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 1484 * y = _mm_andnot_ps(xmm3, y); 1485 * y = _mm_add_ps(y,y2); 1486 */ 1487 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i"); 1488 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i"); 1489 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 1490 LLVMValueRef inv = lp_build_const_v4si(~0); 1491 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); 1492 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 1493 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); 1494 1495 /* 1496 * update the sign 1497 * y = _mm_xor_ps(y, sign_bit); 1498 */ 1499 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin"); 1500 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result"); 1501 return y_result; 1502} 1503 1504 1505/** 1506 * Generate cos(a) using SSE2 1507 */ 1508LLVMValueRef 1509lp_build_cos(struct lp_build_context *bld, 1510 LLVMValueRef a) 1511{ 1512 struct lp_type int_type = lp_int_type(bld->type); 1513 LLVMBuilderRef b = bld->builder; 1514 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4); 1515 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4); 1516 1517 /* 1518 * take the absolute value, 1519 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 1520 */ 1521 1522 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000); 1523 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si"); 1524 1525 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 1526 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs"); 1527 1528 /* 1529 * scale by 4/Pi 1530 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 1531 */ 1532 1533 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516); 1534 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 1535 1536 /* 1537 * store the integer part of y in mm0 1538 * emm2 = _mm_cvttps_epi32(y); 1539 */ 1540 1541 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i"); 1542 1543 /* 1544 * j=(j+1) & (~1) (see the cephes sources) 1545 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 1546 */ 1547 1548 LLVMValueRef all_one = lp_build_const_v4si(1); 1549 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 1550 /* 1551 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 1552 */ 1553 LLVMValueRef inv_one = lp_build_const_v4si(~1); 1554 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 1555 1556 /* 1557 * y = _mm_cvtepi32_ps(emm2); 1558 */ 1559 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2"); 1560 1561 1562 /* 1563 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2); 1564 */ 1565 LLVMValueRef const_2 = lp_build_const_v4si(2); 1566 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2"); 1567 1568 1569 /* get the swap sign flag 1570 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4); 1571 */ 1572 LLVMValueRef inv = lp_build_const_v4si(~0); 1573 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not"); 1574 LLVMValueRef pi32_4 = lp_build_const_v4si(4); 1575 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and"); 1576 1577 /* 1578 * emm2 = _mm_slli_epi32(emm0, 29); 1579 */ 1580 LLVMValueRef const_29 = lp_build_const_v4si(29); 1581 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit"); 1582 1583 /* 1584 * get the polynom selection mask 1585 * there is one polynom for 0 <= x <= Pi/4 1586 * and another one for Pi/4<x<=Pi/2 1587 * Both branches will be computed. 1588 * 1589 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 1590 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 1591 */ 1592 1593 LLVMValueRef pi32_2 = lp_build_const_v4si(2); 1594 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3"); 1595 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL, 1596 emm2_3, lp_build_const_v4si(0)); 1597 1598 /* 1599 * _PS_CONST(minus_cephes_DP1, -0.78515625); 1600 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 1601 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 1602 */ 1603 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625); 1604 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4); 1605 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8); 1606 1607 /* 1608 * The magic pass: "Extended precision modular arithmetic" 1609 * x = ((x - y * DP1) - y * DP2) - y * DP3; 1610 * xmm1 = _mm_mul_ps(y, xmm1); 1611 * xmm2 = _mm_mul_ps(y, xmm2); 1612 * xmm3 = _mm_mul_ps(y, xmm3); 1613 */ 1614 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); 1615 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); 1616 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); 1617 1618 /* 1619 * x = _mm_add_ps(x, xmm1); 1620 * x = _mm_add_ps(x, xmm2); 1621 * x = _mm_add_ps(x, xmm3); 1622 */ 1623 1624 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); 1625 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2"); 1626 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); 1627 1628 /* 1629 * Evaluate the first polynom (0 <= x <= Pi/4) 1630 * 1631 * z = _mm_mul_ps(x,x); 1632 */ 1633 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 1634 1635 /* 1636 * _PS_CONST(coscof_p0, 2.443315711809948E-005); 1637 * _PS_CONST(coscof_p1, -1.388731625493765E-003); 1638 * _PS_CONST(coscof_p2, 4.166664568298827E-002); 1639 */ 1640 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005); 1641 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003); 1642 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002); 1643 1644 /* 1645 * y = *(v4sf*)_ps_coscof_p0; 1646 * y = _mm_mul_ps(y, z); 1647 */ 1648 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); 1649 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4"); 1650 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); 1651 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); 1652 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 1653 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 1654 1655 1656 /* 1657 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 1658 * y = _mm_sub_ps(y, tmp); 1659 * y = _mm_add_ps(y, *(v4sf*)_ps_1); 1660 */ 1661 LLVMValueRef half = lp_build_const_v4sf(0.5); 1662 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); 1663 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 1664 LLVMValueRef one = lp_build_const_v4sf(1.0); 1665 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 1666 1667 /* 1668 * _PS_CONST(sincof_p0, -1.9515295891E-4); 1669 * _PS_CONST(sincof_p1, 8.3321608736E-3); 1670 * _PS_CONST(sincof_p2, -1.6666654611E-1); 1671 */ 1672 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4); 1673 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3); 1674 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1); 1675 1676 /* 1677 * Evaluate the second polynom (Pi/4 <= x <= 0) 1678 * 1679 * y2 = *(v4sf*)_ps_sincof_p0; 1680 * y2 = _mm_mul_ps(y2, z); 1681 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 1682 * y2 = _mm_mul_ps(y2, z); 1683 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 1684 * y2 = _mm_mul_ps(y2, z); 1685 * y2 = _mm_mul_ps(y2, x); 1686 * y2 = _mm_add_ps(y2, x); 1687 */ 1688 1689 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); 1690 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); 1691 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); 1692 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); 1693 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 1694 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); 1695 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); 1696 1697 /* 1698 * select the correct result from the two polynoms 1699 * xmm3 = poly_mask; 1700 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 1701 * y = _mm_andnot_ps(xmm3, y); 1702 * y = _mm_add_ps(y,y2); 1703 */ 1704 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i"); 1705 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i"); 1706 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 1707 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); 1708 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 1709 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); 1710 1711 /* 1712 * update the sign 1713 * y = _mm_xor_ps(y, sign_bit); 1714 */ 1715 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin"); 1716 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result"); 1717 return y_result; 1718} 1719 1720 1721/** 1722 * Generate pow(x, y) 1723 */ 1724LLVMValueRef 1725lp_build_pow(struct lp_build_context *bld, 1726 LLVMValueRef x, 1727 LLVMValueRef y) 1728{ 1729 /* TODO: optimize the constant case */ 1730 if(LLVMIsConstant(x) && LLVMIsConstant(y)) 1731 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 1732 __FUNCTION__); 1733 1734 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y)); 1735} 1736 1737 1738/** 1739 * Generate exp(x) 1740 */ 1741LLVMValueRef 1742lp_build_exp(struct lp_build_context *bld, 1743 LLVMValueRef x) 1744{ 1745 /* log2(e) = 1/log(2) */ 1746 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634); 1747 1748 return lp_build_mul(bld, log2e, lp_build_exp2(bld, x)); 1749} 1750 1751 1752/** 1753 * Generate log(x) 1754 */ 1755LLVMValueRef 1756lp_build_log(struct lp_build_context *bld, 1757 LLVMValueRef x) 1758{ 1759 /* log(2) */ 1760 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529); 1761 1762 return lp_build_mul(bld, log2, lp_build_exp2(bld, x)); 1763} 1764 1765 1766#define EXP_POLY_DEGREE 3 1767#define LOG_POLY_DEGREE 5 1768 1769 1770/** 1771 * Generate polynomial. 1772 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 1773 */ 1774static LLVMValueRef 1775lp_build_polynomial(struct lp_build_context *bld, 1776 LLVMValueRef x, 1777 const double *coeffs, 1778 unsigned num_coeffs) 1779{ 1780 const struct lp_type type = bld->type; 1781 LLVMValueRef res = NULL; 1782 unsigned i; 1783 1784 /* TODO: optimize the constant case */ 1785 if(LLVMIsConstant(x)) 1786 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 1787 __FUNCTION__); 1788 1789 for (i = num_coeffs; i--; ) { 1790 LLVMValueRef coeff; 1791 1792 coeff = lp_build_const_vec(type, coeffs[i]); 1793 1794 if(res) 1795 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res)); 1796 else 1797 res = coeff; 1798 } 1799 1800 if(res) 1801 return res; 1802 else 1803 return bld->undef; 1804} 1805 1806 1807/** 1808 * Minimax polynomial fit of 2**x, in range [0, 1[ 1809 */ 1810const double lp_build_exp2_polynomial[] = { 1811#if EXP_POLY_DEGREE == 5 1812 0.999999999690134838155, 1813 0.583974334321735217258, 1814 0.164553105719676828492, 1815 0.0292811063701710962255, 1816 0.00354944426657875141846, 1817 0.000296253726543423377365 1818#elif EXP_POLY_DEGREE == 4 1819 1.00000001502262084505, 1820 0.563586057338685991394, 1821 0.150436017652442413623, 1822 0.0243220604213317927308, 1823 0.0025359088446580436489 1824#elif EXP_POLY_DEGREE == 3 1825 0.999925218562710312959, 1826 0.695833540494823811697, 1827 0.226067155427249155588, 1828 0.0780245226406372992967 1829#elif EXP_POLY_DEGREE == 2 1830 1.00172476321474503578, 1831 0.657636275736077639316, 1832 0.33718943461968720704 1833#else 1834#error 1835#endif 1836}; 1837 1838 1839void 1840lp_build_exp2_approx(struct lp_build_context *bld, 1841 LLVMValueRef x, 1842 LLVMValueRef *p_exp2_int_part, 1843 LLVMValueRef *p_frac_part, 1844 LLVMValueRef *p_exp2) 1845{ 1846 const struct lp_type type = bld->type; 1847 LLVMTypeRef vec_type = lp_build_vec_type(type); 1848 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1849 LLVMValueRef ipart = NULL; 1850 LLVMValueRef fpart = NULL; 1851 LLVMValueRef expipart = NULL; 1852 LLVMValueRef expfpart = NULL; 1853 LLVMValueRef res = NULL; 1854 1855 if(p_exp2_int_part || p_frac_part || p_exp2) { 1856 /* TODO: optimize the constant case */ 1857 if(LLVMIsConstant(x)) 1858 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 1859 __FUNCTION__); 1860 1861 assert(type.floating && type.width == 32); 1862 1863 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0)); 1864 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999)); 1865 1866 /* ipart = floor(x) */ 1867 ipart = lp_build_floor(bld, x); 1868 1869 /* fpart = x - ipart */ 1870 fpart = LLVMBuildFSub(bld->builder, x, ipart, ""); 1871 } 1872 1873 if(p_exp2_int_part || p_exp2) { 1874 /* expipart = (float) (1 << ipart) */ 1875 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, ""); 1876 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), ""); 1877 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), ""); 1878 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, ""); 1879 } 1880 1881 if(p_exp2) { 1882 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, 1883 Elements(lp_build_exp2_polynomial)); 1884 1885 res = LLVMBuildFMul(bld->builder, expipart, expfpart, ""); 1886 } 1887 1888 if(p_exp2_int_part) 1889 *p_exp2_int_part = expipart; 1890 1891 if(p_frac_part) 1892 *p_frac_part = fpart; 1893 1894 if(p_exp2) 1895 *p_exp2 = res; 1896} 1897 1898 1899LLVMValueRef 1900lp_build_exp2(struct lp_build_context *bld, 1901 LLVMValueRef x) 1902{ 1903 LLVMValueRef res; 1904 lp_build_exp2_approx(bld, x, NULL, NULL, &res); 1905 return res; 1906} 1907 1908 1909/** 1910 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 1911 * These coefficients can be generate with 1912 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html 1913 */ 1914const double lp_build_log2_polynomial[] = { 1915#if LOG_POLY_DEGREE == 6 1916 3.11578814719469302614, 1917 -3.32419399085241980044, 1918 2.59883907202499966007, 1919 -1.23152682416275988241, 1920 0.318212422185251071475, 1921 -0.0344359067839062357313 1922#elif LOG_POLY_DEGREE == 5 1923 2.8882704548164776201, 1924 -2.52074962577807006663, 1925 1.48116647521213171641, 1926 -0.465725644288844778798, 1927 0.0596515482674574969533 1928#elif LOG_POLY_DEGREE == 4 1929 2.61761038894603480148, 1930 -1.75647175389045657003, 1931 0.688243882994381274313, 1932 -0.107254423828329604454 1933#elif LOG_POLY_DEGREE == 3 1934 2.28330284476918490682, 1935 -1.04913055217340124191, 1936 0.204446009836232697516 1937#else 1938#error 1939#endif 1940}; 1941 1942 1943/** 1944 * See http://www.devmaster.net/forums/showthread.php?p=43580 1945 */ 1946void 1947lp_build_log2_approx(struct lp_build_context *bld, 1948 LLVMValueRef x, 1949 LLVMValueRef *p_exp, 1950 LLVMValueRef *p_floor_log2, 1951 LLVMValueRef *p_log2) 1952{ 1953 const struct lp_type type = bld->type; 1954 LLVMTypeRef vec_type = lp_build_vec_type(type); 1955 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); 1956 1957 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000); 1958 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff); 1959 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); 1960 1961 LLVMValueRef i = NULL; 1962 LLVMValueRef exp = NULL; 1963 LLVMValueRef mant = NULL; 1964 LLVMValueRef logexp = NULL; 1965 LLVMValueRef logmant = NULL; 1966 LLVMValueRef res = NULL; 1967 1968 if(p_exp || p_floor_log2 || p_log2) { 1969 /* TODO: optimize the constant case */ 1970 if(LLVMIsConstant(x)) 1971 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 1972 __FUNCTION__); 1973 1974 assert(type.floating && type.width == 32); 1975 1976 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, ""); 1977 1978 /* exp = (float) exponent(x) */ 1979 exp = LLVMBuildAnd(bld->builder, i, expmask, ""); 1980 } 1981 1982 if(p_floor_log2 || p_log2) { 1983 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), ""); 1984 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), ""); 1985 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, ""); 1986 } 1987 1988 if(p_log2) { 1989 /* mant = (float) mantissa(x) */ 1990 mant = LLVMBuildAnd(bld->builder, i, mantmask, ""); 1991 mant = LLVMBuildOr(bld->builder, mant, one, ""); 1992 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, ""); 1993 1994 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial, 1995 Elements(lp_build_log2_polynomial)); 1996 1997 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ 1998 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), ""); 1999 2000 res = LLVMBuildFAdd(bld->builder, logmant, logexp, ""); 2001 } 2002 2003 if(p_exp) { 2004 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, ""); 2005 *p_exp = exp; 2006 } 2007 2008 if(p_floor_log2) 2009 *p_floor_log2 = logexp; 2010 2011 if(p_log2) 2012 *p_log2 = res; 2013} 2014 2015 2016LLVMValueRef 2017lp_build_log2(struct lp_build_context *bld, 2018 LLVMValueRef x) 2019{ 2020 LLVMValueRef res; 2021 lp_build_log2_approx(bld, x, NULL, NULL, &res); 2022 return res; 2023} 2024