lp_bld_arit.c revision da5e9fce47b2029c6f6445ed53f3b5e5ff3889a0
1/************************************************************************** 2 * 3 * Copyright 2009-2010 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 29/** 30 * @file 31 * Helper 32 * 33 * LLVM IR doesn't support all basic arithmetic operations we care about (most 34 * notably min/max and saturated operations), and it is often necessary to 35 * resort machine-specific intrinsics directly. The functions here hide all 36 * these implementation details from the other modules. 37 * 38 * We also do simple expressions simplification here. 
Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_arit.h"


/* NOTE(review): presumably the polynomial degrees for the exp2/log2
 * approximations defined later in this file — confirm against the rest of
 * the file, which is not visible in this chunk. */
#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 5


/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 *
 * Picks an SSE/SSE2/SSE4.1 min intrinsic when the vector is exactly 128 bits
 * wide and the CPU supports it; otherwise falls back to a generic
 * compare + select sequence.  The width/sign tests below are mutually
 * exclusive, so the order of the assignments does not matter.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if(type.width * type.length == 128) {
      if(type.floating) {
         /* NOTE(review): minps/minpd have asymmetric NaN behavior (result
          * taken from the second operand when either input is a NaN) —
          * presumably acceptable for the callers here; confirm. */
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.min.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.min.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pminu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmins.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);

   /* Generic fallback: select(a < b, a, b). */
   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 *
 * Mirror image of lp_build_min_simple: uses the corresponding SSE max
 * intrinsics for 128-bit vectors, otherwise compare + select.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.max.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.max.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxs.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 *
 * For unsigned normalized integers (norm, !floating, !fixed, !sign)
 * 1.0 is represented as all-ones, so the complement is a plain bitwise NOT.
 * Constant folding is done at build time when the operand is a constant.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 *
 * For normalized types, uses the SSE2 saturated-add intrinsics when
 * available (8/16-bit, 128-bit vectors); otherwise emits a plain add and,
 * for float/fixed normalized types, clamps the result to 1.0.
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* Trivial simplifications. */
   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* Saturated add: anything plus 1.0 is 1.0. */
      if(a == bld->one || b == bld->one)
        return bld->one;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a
 *
 * Extracts each lane in turn and accumulates with scalar adds
 * (FAdd for floating types).  Not valid for normalized types
 * (the sum could overflow the normalized range), hence the assert.
 */
LLVMValueRef
lp_build_sum_vector(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, a, index, "");

   for (i = 1; i < type.length; i++) {
      index = lp_build_const_int32(bld->gallivm, i);
      if (type.floating)
         res = LLVMBuildFAdd(builder, res,
                             LLVMBuildExtractElement(builder,
                                                     a, index, ""),
                             "");
      else
         res = LLVMBuildAdd(builder, res,
                            LLVMBuildExtractElement(builder,
                                                    a, index, ""),
                            "");
   }

   return res;
}


/**
 * Generate a - b
 *
 * Counterpart of lp_build_add: uses SSE2 saturated-subtract intrinsics for
 * normalized 8/16-bit integer vectors, and clamps float/fixed normalized
 * results to a floor of 0.0.
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* Saturated subtract: anything minus 1.0 is 0.0. */
      if(b == bld->one)
        return bld->zero;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}


/**
 * Normalized 8bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 256
 *
 *     which is the fastest method that satisfies the following OpenGL criteria
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
384 * 385 * in this case just the first two terms to fit in 16bit arithmetic 386 * 387 * t/255 ~= (t + (t >> 8)) >> 8 388 * 389 * note that just by itself it doesn't satisfies the OpenGL criteria, as 390 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff 391 * must be used 392 * 393 * - geometric series plus rounding 394 * 395 * when using a geometric series division instead of truncating the result 396 * use roundoff in the approximation (Jim Blinn) 397 * 398 * t/255 ~= (t + (t >> 8) + 0x80) >> 8 399 * 400 * achieving the exact results 401 * 402 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 403 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf 404 * @sa Michael Herf, The "double blend trick", May 2000, 405 * http://www.stereopsis.com/doubleblend.html 406 */ 407static LLVMValueRef 408lp_build_mul_u8n(struct gallivm_state *gallivm, 409 struct lp_type i16_type, 410 LLVMValueRef a, LLVMValueRef b) 411{ 412 LLVMBuilderRef builder = gallivm->builder; 413 LLVMValueRef c8; 414 LLVMValueRef ab; 415 416 assert(!i16_type.floating); 417 assert(lp_check_value(i16_type, a)); 418 assert(lp_check_value(i16_type, b)); 419 420 c8 = lp_build_const_int_vec(gallivm, i16_type, 8); 421 422#if 0 423 424 /* a*b/255 ~= (a*(b + 1)) >> 256 */ 425 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallium, i16_type, 1), ""); 426 ab = LLVMBuildMul(builder, a, b, ""); 427 428#else 429 430 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */ 431 ab = LLVMBuildMul(builder, a, b, ""); 432 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), ""); 433 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), ""); 434 435#endif 436 437 ab = LLVMBuildLShr(builder, ab, c8, ""); 438 439 return ab; 440} 441 442 443/** 444 * Generate a * b 445 */ 446LLVMValueRef 447lp_build_mul(struct lp_build_context *bld, 448 LLVMValueRef a, 449 LLVMValueRef b) 450{ 451 LLVMBuilderRef builder = bld->gallivm->builder; 452 const struct 
lp_type type = bld->type; 453 LLVMValueRef shift; 454 LLVMValueRef res; 455 456 assert(lp_check_value(type, a)); 457 assert(lp_check_value(type, b)); 458 459 if(a == bld->zero) 460 return bld->zero; 461 if(a == bld->one) 462 return b; 463 if(b == bld->zero) 464 return bld->zero; 465 if(b == bld->one) 466 return a; 467 if(a == bld->undef || b == bld->undef) 468 return bld->undef; 469 470 if(!type.floating && !type.fixed && type.norm) { 471 if(type.width == 8) { 472 struct lp_type i16_type = lp_wider_type(type); 473 LLVMValueRef al, ah, bl, bh, abl, abh, ab; 474 475 lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah); 476 lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh); 477 478 /* PMULLW, PSRLW, PADDW */ 479 abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl); 480 abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh); 481 482 ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh); 483 484 return ab; 485 } 486 487 /* FIXME */ 488 assert(0); 489 } 490 491 if(type.fixed) 492 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2); 493 else 494 shift = NULL; 495 496 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 497 if (type.floating) 498 res = LLVMConstFMul(a, b); 499 else 500 res = LLVMConstMul(a, b); 501 if(shift) { 502 if(type.sign) 503 res = LLVMConstAShr(res, shift); 504 else 505 res = LLVMConstLShr(res, shift); 506 } 507 } 508 else { 509 if (type.floating) 510 res = LLVMBuildFMul(builder, a, b, ""); 511 else 512 res = LLVMBuildMul(builder, a, b, ""); 513 if(shift) { 514 if(type.sign) 515 res = LLVMBuildAShr(builder, res, shift, ""); 516 else 517 res = LLVMBuildLShr(builder, res, shift, ""); 518 } 519 } 520 521 return res; 522} 523 524 525/** 526 * Small vector x scale multiplication optimization. 
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   /* Trivial multipliers first. */
   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the mantissa.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
         /* Disabled block above intentionally left empty: float power-of-two
          * falls through to the generic multiply below. */
      }
      else {
         /* Integer power of two: multiply becomes a left shift. */
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   /* Generic case: splat the scalar and do a full multiply. */
   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 *
 * Constant operands are folded.  For 4x32 float vectors on SSE hardware the
 * division is emitted as a multiply by the (approximate) reciprocal, which
 * trades precision for speed.
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation -- without any checks.
 *
 * Computes v0 + x*(v1 - v0).  For fixed-point types the result is masked
 * down to the low half of the word (see the XXX note below).
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if (bld->type.fixed) {
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for other uses. Basically we need a more
       * powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
   }

   return res;
}


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
      LLVMValueRef shift;

      /* Unsigned normalized vectors only; the unpack below splits the
       * vector in two halves. */
      assert(type.length >= 2);
      assert(!type.sign);

      /*
       * Create a wider type, enough to hold the intermediate result of the
       * multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.fixed  = TRUE;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Scale x from [0, 255] to [0, 256]
       */

      shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);

      xl = lp_build_add(&wide_bld, xl,
                        LLVMBuildAShr(builder, xl, shift, ""));
      xh = lp_build_add(&wide_bld, xh,
                        LLVMBuildAShr(builder, xh, shift, ""));

      /*
       * Lerp both halves.
       */

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1);
   }

   return res;
}


/**
 * Bilinear interpolation: lerp along y between the two x-lerped rows.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
   return lp_build_lerp(bld, y, v0, v1);
}


/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   /* For normalized types values are bounded by [0, 1], which enables
    * these extra simplifications. */
   if(bld->type.norm) {
      if(a == bld->zero || b == bld->zero)
         return bld->zero;
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}


/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if(a == bld->zero)
         return b;
      if(b == bld->zero)
         return a;
   }

   return lp_build_max_simple(bld, a, b);
}


/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate abs(a)
 *
 * Floats: clear the sign bit via integer masking.  128-bit integer vectors
 * on SSSE3: use the pabs intrinsics.  Otherwise: max(a, -a).
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   /* Unsigned values are their own absolute value. */
   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}


/**
 * Generate -a.  Uses FNeg for floats on LLVM >= 2.7, integer Neg otherwise.
 */
LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a
 *
 * Floats: OR the extracted sign bit into the constant 1.0, yielding +/-1.0,
 * then select 0 where a == 0.  Unsigned types can only be 0 or +1.
 */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* Signed integers: compare-and-select between +1 and -1. */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a);
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}



/* Immediate operand values for the SSE4.1 ROUNDxx instructions. */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};


/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      /* Scalar case: the round.ss/sd intrinsics operate on a 4-wide
       * vector, so insert the scalar into an undef vector and extract
       * lane 0 of the result. */
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      assert(type.width*type.length == 128);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ps";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.pd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}


/**
 * Round-to-nearest float->int conversion via SSE2 cvtss2si/cvtps2dq.
 * Only supports 32-bit floats, scalar or 4-wide.
 */
static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      assert(type.width*type.length == 128);

      intrinsic = "llvm.x86.sse2.cvtps2dq";

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
   }
   else {
      /* Fallback: round-trip through the signed integer type, which
       * truncates toward zero. */
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      /* Fallback: integer round then convert back to float. */
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_ifloor(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iceil(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(a)
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* LLVM's fptosi rounds toward zero, which is exactly trunc. */
   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
      /* CVTPS2DQ/CVTSS2SI round to nearest directly. */
      return lp_build_iround_nearest_sse2(bld, a);
   }
   else if (util_cpu_caps.has_sse4_1 &&
            (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      /* Fallback: add copysign(0.5, a) then truncate. */
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* half = sign(a) * 0.5 -- OR a's sign bit into the 0.5 constant so
          * the addition below rounds away from zero for negative inputs. */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}


/**
 * Return floor of float (vector), result is an int (vector)
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      res = a;

      if (type.sign) {
         /* Take the sign bit and add it to 1 constant */
         LLVMTypeRef vec_type = bld->vec_type;
         unsigned mantissa = lp_mantissa(type);
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                  (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;
         LLVMValueRef offset;

         /* sign = a < 0 ? ~0 : 0 (arithmetic shift replicates the sign bit) */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "ifloor.sign");

         /* offset = -0.99999(9)f */
         offset = lp_build_const_vec(bld->gallivm, type,
                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
         offset = LLVMConstBitCast(offset, int_vec_type);

         /* offset = a < 0 ? offset : 0.0f */
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");

         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
      }
   }

   /* fptosi truncates toward zero; the offset above turns that into floor()
    * for negative inputs */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = bld->vec_type;
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef offset;

      /* offset = 0.99999(9)f */
      offset = lp_build_const_vec(bld->gallivm, type,
                                  (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));

      if (type.sign) {
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                  (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* sign = a < 0 ? 0 : ~0 */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "iceil.sign");
         sign = LLVMBuildNot(builder, sign, "iceil.not");

         /* offset = a < 0 ? 0.0 : offset */
         offset = LLVMConstBitCast(offset, int_vec_type);
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
      }

      res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
   }

   /* fptosi truncates toward zero; the offset above turns that into ceil()
    * for positive inputs */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}


/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


/**
 * Generate sqrt(a), via the llvm.sqrt.* intrinsic.
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}


/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo.  It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, a, rcp_a, "");
   res = LLVMBuildFSub(builder, two, res, "");
   res = LLVMBuildFMul(builder, rcp_a, res, "");

   return res;
}


/**
 * Generate 1/a.
 */
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   /* Trivial cases: 1/0 is left undefined, 1/1 == 1, 1/undef == undef. */
   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   /* NOTE: deliberately disabled (FALSE &&) per the rationale above. */
   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}


/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also:
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}


/**
 * Generate 1/sqrt(a)
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* RSQRTPS estimate plus one refinement step. */
      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res =
lp_build_rsqrt_refine(bld, a, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate sin(a) using SSE2.
 *
 * Cephes-style argument reduction plus two minimax polynomials (see the
 * inline comments, which quote the equivalent SSE intrinsic sequence).
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   return y_result;
}


/**
 * Generate cos(a) using SSE2.
 *
 * Same cephes-style reduction as lp_build_sin(), with the quadrant shifted
 * so that cos(x) = sin(x + Pi/2).
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 =
_mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");


   /*
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    */
   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");


   /* get the swap sign flag
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   return y_result;
}


/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /* pow(x, y) == exp2(log2(x) * y) */
   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}


/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e =
lp_build_const_vec(bld->gallivm, bld->type, 2130 1.4426950408889634); 2131 2132 assert(lp_check_value(bld->type, x)); 2133 2134 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x)); 2135} 2136 2137 2138/** 2139 * Generate log(x) 2140 */ 2141LLVMValueRef 2142lp_build_log(struct lp_build_context *bld, 2143 LLVMValueRef x) 2144{ 2145 /* log(2) */ 2146 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 2147 0.69314718055994529); 2148 2149 assert(lp_check_value(bld->type, x)); 2150 2151 return lp_build_mul(bld, log2, lp_build_log2(bld, x)); 2152} 2153 2154 2155/** 2156 * Generate polynomial. 2157 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 2158 */ 2159static LLVMValueRef 2160lp_build_polynomial(struct lp_build_context *bld, 2161 LLVMValueRef x, 2162 const double *coeffs, 2163 unsigned num_coeffs) 2164{ 2165 const struct lp_type type = bld->type; 2166 LLVMValueRef res = NULL; 2167 unsigned i; 2168 2169 assert(lp_check_value(bld->type, x)); 2170 2171 /* TODO: optimize the constant case */ 2172 if (gallivm_debug & GALLIVM_DEBUG_PERF && 2173 LLVMIsConstant(x)) { 2174 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2175 __FUNCTION__); 2176 } 2177 2178 for (i = num_coeffs; i--; ) { 2179 LLVMValueRef coeff; 2180 2181 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]); 2182 2183 if(res) 2184 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res)); 2185 else 2186 res = coeff; 2187 } 2188 2189 if(res) 2190 return res; 2191 else 2192 return bld->undef; 2193} 2194 2195 2196/** 2197 * Minimax polynomial fit of 2**x, in range [0, 1[ 2198 */ 2199const double lp_build_exp2_polynomial[] = { 2200#if EXP_POLY_DEGREE == 5 2201 0.999999925063526176901, 2202 0.693153073200168932794, 2203 0.240153617044375388211, 2204 0.0558263180532956664775, 2205 0.00898934009049466391101, 2206 0.00187757667519147912699 2207#elif EXP_POLY_DEGREE == 4 2208 1.00000259337069434683, 2209 0.693003834469974940458, 2210 0.24144275689150793076, 2211 
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};


/**
 * Approximate 2**x, optionally exposing the intermediate results.
 *
 * The result is assembled as 2**ipart * 2**fpart: the integer part of x is
 * turned directly into the exponent field of a float via integer bit
 * manipulation, and 2**fpart (fpart in [0, 1[) comes from the minimax
 * polynomial above.
 *
 * Any of the output pointers may be NULL; only what is requested gets
 * computed.  Input must be a 32-bit float vector (asserted below).
 *
 * @param x               exponent operand
 * @param p_exp2_int_part optional out: 2**floor(x) as a float vector
 * @param p_frac_part     optional out: x - floor(x)
 * @param p_exp2          optional out: the full 2**x approximation
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /* Clamp x so that ipart + 127 below stays within the float32
       * exponent field instead of wrapping. */
      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      /* fpart = x - ipart */
      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) */
      /* Add the float32 exponent bias (127) and shift the result into the
       * exponent field (bit 23); the bitcast reinterprets it as 2**ipart. */
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* expfpart = 2**fpart via the minimax polynomial, fpart in [0, 1[ */
      expfpart = lp_build_polynomial(bld, fpart,
lp_build_exp2_polynomial, 2275 Elements(lp_build_exp2_polynomial)); 2276 2277 res = LLVMBuildFMul(builder, expipart, expfpart, ""); 2278 } 2279 2280 if(p_exp2_int_part) 2281 *p_exp2_int_part = expipart; 2282 2283 if(p_frac_part) 2284 *p_frac_part = fpart; 2285 2286 if(p_exp2) 2287 *p_exp2 = res; 2288} 2289 2290 2291LLVMValueRef 2292lp_build_exp2(struct lp_build_context *bld, 2293 LLVMValueRef x) 2294{ 2295 LLVMValueRef res; 2296 lp_build_exp2_approx(bld, x, NULL, NULL, &res); 2297 return res; 2298} 2299 2300 2301/** 2302 * Extract the exponent of a IEEE-754 floating point value. 2303 * 2304 * Optionally apply an integer bias. 2305 * 2306 * Result is an integer value with 2307 * 2308 * ifloor(log2(x)) + bias 2309 */ 2310LLVMValueRef 2311lp_build_extract_exponent(struct lp_build_context *bld, 2312 LLVMValueRef x, 2313 int bias) 2314{ 2315 LLVMBuilderRef builder = bld->gallivm->builder; 2316 const struct lp_type type = bld->type; 2317 unsigned mantissa = lp_mantissa(type); 2318 LLVMValueRef res; 2319 2320 assert(type.floating); 2321 2322 assert(lp_check_value(bld->type, x)); 2323 2324 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 2325 2326 res = LLVMBuildLShr(builder, x, 2327 lp_build_const_int_vec(bld->gallivm, type, mantissa), ""); 2328 res = LLVMBuildAnd(builder, res, 2329 lp_build_const_int_vec(bld->gallivm, type, 255), ""); 2330 res = LLVMBuildSub(builder, res, 2331 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), ""); 2332 2333 return res; 2334} 2335 2336 2337/** 2338 * Extract the mantissa of the a floating. 
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   /* Mask covering the mantissa bits only. */
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   /* Bit pattern of 1.0 in this float type (biased exponent, zero mantissa). */
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   /* Keep x's mantissa bits and OR in the exponent of 1.0, producing a
    * value in [1, 2[ with the same mantissa as x. */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}



/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614,
   -3.32419399085241980044,
   2.59883907202499966007,
   -1.23152682416275988241,
   0.318212422185251071475,
   -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201,
   -2.52074962577807006663,
   1.48116647521213171641,
   -0.465725644288844778798,
   0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148,
   -1.75647175389045657003,
   0.688243882994381274313,
   -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682,
   -1.04913055217340124191,
   0.204446009836232697516
#else
#error
#endif
};


/**
 * See
http://www.devmaster.net/forums/showthread.php?p=43580
 *
 * Computes log2(x) as exponent(x) + poly(mantissa(x)), where the polynomial
 * approximates log2 of the mantissa in [1, 2[.  Any of the output pointers
 * may be NULL; only the requested values are computed.  Input must be a
 * 32-bit float vector (asserted below).
 *
 * @param x            input vector
 * @param p_exp        optional out: raw exponent field bits, bitcast to float
 * @param p_floor_log2 optional out: floor(log2(x)) as a float vector
 * @param p_log2       optional out: the full log2(x) approximation
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   /* Float32 bit-field masks: exponent and mantissa. */
   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   /* Bit pattern of 1.0f (biased exponent 127, zero mantissa). */
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* logexp = floor(log2(x)): shift the exponent field down and
       * subtract the float32 bias, then convert to float. */
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x) */
      /* OR in the exponent of 1.0 so mant lies in [1, 2[. */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
      logmant = LLVMBuildFMul(builder, logmant, LLVMBuildFSub(builder, mant, bld->one, ""), "");

      res = LLVMBuildFAdd(builder, logmant, logexp, "");
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}


/**
 * Generate log2(x) via lp_build_log2_approx().
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
2507 * 2508 * See http://www.flipcode.com/archives/Fast_log_Function.shtml 2509 */ 2510LLVMValueRef 2511lp_build_fast_log2(struct lp_build_context *bld, 2512 LLVMValueRef x) 2513{ 2514 LLVMBuilderRef builder = bld->gallivm->builder; 2515 LLVMValueRef ipart; 2516 LLVMValueRef fpart; 2517 2518 assert(lp_check_value(bld->type, x)); 2519 2520 assert(bld->type.floating); 2521 2522 /* ipart = floor(log2(x)) - 1 */ 2523 ipart = lp_build_extract_exponent(bld, x, -1); 2524 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, ""); 2525 2526 /* fpart = x / 2**ipart */ 2527 fpart = lp_build_extract_mantissa(bld, x); 2528 2529 /* ipart + fpart */ 2530 return LLVMBuildFAdd(builder, ipart, fpart, ""); 2531} 2532 2533 2534/** 2535 * Fast implementation of iround(log2(x)). 2536 * 2537 * Not an approximation -- it should give accurate results all the time. 2538 */ 2539LLVMValueRef 2540lp_build_ilog2(struct lp_build_context *bld, 2541 LLVMValueRef x) 2542{ 2543 LLVMBuilderRef builder = bld->gallivm->builder; 2544 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2); 2545 LLVMValueRef ipart; 2546 2547 assert(bld->type.floating); 2548 2549 assert(lp_check_value(bld->type, x)); 2550 2551 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ 2552 x = LLVMBuildFMul(builder, x, sqrt2, ""); 2553 2554 /* ipart = floor(log2(x) + 0.5) */ 2555 ipart = lp_build_extract_exponent(bld, x, 0); 2556 2557 return ipart; 2558} 2559 2560LLVMValueRef 2561lp_build_mod(struct lp_build_context *bld, 2562 LLVMValueRef x, 2563 LLVMValueRef y) 2564{ 2565 LLVMBuilderRef builder = bld->gallivm->builder; 2566 LLVMValueRef res; 2567 const struct lp_type type = bld->type; 2568 2569 assert(lp_check_value(type, x)); 2570 assert(lp_check_value(type, y)); 2571 2572 if (type.floating) 2573 res = LLVMBuildFRem(builder, x, y, ""); 2574 else if (type.sign) 2575 res = LLVMBuildSRem(builder, x, y, ""); 2576 else 2577 res = LLVMBuildURem(builder, x, y, ""); 2578 return res; 2579} 2580