lp_bld_arit.c revision a3d4af0c00a198bba00c0965131a14223d426dd4
1/************************************************************************** 2 * 3 * Copyright 2009-2010 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 29/** 30 * @file 31 * Helper 32 * 33 * LLVM IR doesn't support all basic arithmetic operations we care about (most 34 * notably min/max and saturated operations), and it is often necessary to 35 * resort machine-specific intrinsics directly. The functions here hide all 36 * these implementation details from the other modules. 37 * 38 * We also do simple expressions simplification here. 
Reasons are: 39 * - it is very easy given we have all necessary information readily available 40 * - LLVM optimization passes fail to simplify several vector expressions 41 * - We often know value constraints which the optimization passes have no way 42 * of knowing, such as when source arguments are known to be in [0, 1] range. 43 * 44 * @author Jose Fonseca <jfonseca@vmware.com> 45 */ 46 47 48#include "util/u_memory.h" 49#include "util/u_debug.h" 50#include "util/u_math.h" 51#include "util/u_string.h" 52#include "util/u_cpu_detect.h" 53 54#include "lp_bld_type.h" 55#include "lp_bld_const.h" 56#include "lp_bld_init.h" 57#include "lp_bld_intr.h" 58#include "lp_bld_logic.h" 59#include "lp_bld_pack.h" 60#include "lp_bld_debug.h" 61#include "lp_bld_arit.h" 62 63 64#define EXP_POLY_DEGREE 5 65 66#define LOG_POLY_DEGREE 4 67 68 69/** 70 * Generate min(a, b) 71 * No checks for special case values of a or b = 1 or 0 are done. 72 */ 73static LLVMValueRef 74lp_build_min_simple(struct lp_build_context *bld, 75 LLVMValueRef a, 76 LLVMValueRef b) 77{ 78 LLVMBuilderRef builder = bld->gallivm->builder; 79 const struct lp_type type = bld->type; 80 const char *intrinsic = NULL; 81 LLVMValueRef cond; 82 83 assert(lp_check_value(type, a)); 84 assert(lp_check_value(type, b)); 85 86 /* TODO: optimize the constant case */ 87 88 if(type.width * type.length == 128) { 89 if(type.floating) { 90 if(type.width == 32 && util_cpu_caps.has_sse) 91 intrinsic = "llvm.x86.sse.min.ps"; 92 if(type.width == 64 && util_cpu_caps.has_sse2) 93 intrinsic = "llvm.x86.sse2.min.pd"; 94 } 95 else { 96 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) 97 intrinsic = "llvm.x86.sse2.pminu.b"; 98 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) 99 intrinsic = "llvm.x86.sse41.pminsb"; 100 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) 101 intrinsic = "llvm.x86.sse41.pminuw"; 102 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) 103 intrinsic = 
"llvm.x86.sse2.pmins.w"; 104 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) 105 intrinsic = "llvm.x86.sse41.pminud"; 106 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) 107 intrinsic = "llvm.x86.sse41.pminsd"; 108 } 109 } 110 111 if(intrinsic) 112 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 113 114 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 115 return lp_build_select(bld, cond, a, b); 116} 117 118 119/** 120 * Generate max(a, b) 121 * No checks for special case values of a or b = 1 or 0 are done. 122 */ 123static LLVMValueRef 124lp_build_max_simple(struct lp_build_context *bld, 125 LLVMValueRef a, 126 LLVMValueRef b) 127{ 128 LLVMBuilderRef builder = bld->gallivm->builder; 129 const struct lp_type type = bld->type; 130 const char *intrinsic = NULL; 131 LLVMValueRef cond; 132 133 assert(lp_check_value(type, a)); 134 assert(lp_check_value(type, b)); 135 136 /* TODO: optimize the constant case */ 137 138 if(type.width * type.length == 128) { 139 if(type.floating) { 140 if(type.width == 32 && util_cpu_caps.has_sse) 141 intrinsic = "llvm.x86.sse.max.ps"; 142 if(type.width == 64 && util_cpu_caps.has_sse2) 143 intrinsic = "llvm.x86.sse2.max.pd"; 144 } 145 else { 146 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) 147 intrinsic = "llvm.x86.sse2.pmaxu.b"; 148 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) 149 intrinsic = "llvm.x86.sse41.pmaxsb"; 150 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) 151 intrinsic = "llvm.x86.sse41.pmaxuw"; 152 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) 153 intrinsic = "llvm.x86.sse2.pmaxs.w"; 154 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) 155 intrinsic = "llvm.x86.sse41.pmaxud"; 156 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) 157 intrinsic = "llvm.x86.sse41.pmaxsd"; 158 } 159 } 160 161 if(intrinsic) 162 return lp_build_intrinsic_binary(builder, intrinsic, 
lp_build_vec_type(bld->gallivm, bld->type), a, b); 163 164 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 165 return lp_build_select(bld, cond, a, b); 166} 167 168 169/** 170 * Generate 1 - a, or ~a depending on bld->type. 171 */ 172LLVMValueRef 173lp_build_comp(struct lp_build_context *bld, 174 LLVMValueRef a) 175{ 176 LLVMBuilderRef builder = bld->gallivm->builder; 177 const struct lp_type type = bld->type; 178 179 assert(lp_check_value(type, a)); 180 181 if(a == bld->one) 182 return bld->zero; 183 if(a == bld->zero) 184 return bld->one; 185 186 if(type.norm && !type.floating && !type.fixed && !type.sign) { 187 if(LLVMIsConstant(a)) 188 return LLVMConstNot(a); 189 else 190 return LLVMBuildNot(builder, a, ""); 191 } 192 193 if(LLVMIsConstant(a)) 194 if (type.floating) 195 return LLVMConstFSub(bld->one, a); 196 else 197 return LLVMConstSub(bld->one, a); 198 else 199 if (type.floating) 200 return LLVMBuildFSub(builder, bld->one, a, ""); 201 else 202 return LLVMBuildSub(builder, bld->one, a, ""); 203} 204 205 206/** 207 * Generate a + b 208 */ 209LLVMValueRef 210lp_build_add(struct lp_build_context *bld, 211 LLVMValueRef a, 212 LLVMValueRef b) 213{ 214 LLVMBuilderRef builder = bld->gallivm->builder; 215 const struct lp_type type = bld->type; 216 LLVMValueRef res; 217 218 assert(lp_check_value(type, a)); 219 assert(lp_check_value(type, b)); 220 221 if(a == bld->zero) 222 return b; 223 if(b == bld->zero) 224 return a; 225 if(a == bld->undef || b == bld->undef) 226 return bld->undef; 227 228 if(bld->type.norm) { 229 const char *intrinsic = NULL; 230 231 if(a == bld->one || b == bld->one) 232 return bld->one; 233 234 if(util_cpu_caps.has_sse2 && 235 type.width * type.length == 128 && 236 !type.floating && !type.fixed) { 237 if(type.width == 8) 238 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; 239 if(type.width == 16) 240 intrinsic = type.sign ? 
"llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; 241 } 242 243 if(intrinsic) 244 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 245 } 246 247 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 248 if (type.floating) 249 res = LLVMConstFAdd(a, b); 250 else 251 res = LLVMConstAdd(a, b); 252 else 253 if (type.floating) 254 res = LLVMBuildFAdd(builder, a, b, ""); 255 else 256 res = LLVMBuildAdd(builder, a, b, ""); 257 258 /* clamp to ceiling of 1.0 */ 259 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 260 res = lp_build_min_simple(bld, res, bld->one); 261 262 /* XXX clamp to floor of -1 or 0??? */ 263 264 return res; 265} 266 267 268/** Return the scalar sum of the elements of a */ 269LLVMValueRef 270lp_build_sum_vector(struct lp_build_context *bld, 271 LLVMValueRef a) 272{ 273 LLVMBuilderRef builder = bld->gallivm->builder; 274 const struct lp_type type = bld->type; 275 LLVMValueRef index, res; 276 unsigned i; 277 278 assert(lp_check_value(type, a)); 279 280 if (type.length == 1) { 281 return a; 282 } 283 284 assert(!bld->type.norm); 285 286 index = lp_build_const_int32(bld->gallivm, 0); 287 res = LLVMBuildExtractElement(builder, a, index, ""); 288 289 for (i = 1; i < type.length; i++) { 290 index = lp_build_const_int32(bld->gallivm, i); 291 if (type.floating) 292 res = LLVMBuildFAdd(builder, res, 293 LLVMBuildExtractElement(builder, 294 a, index, ""), 295 ""); 296 else 297 res = LLVMBuildAdd(builder, res, 298 LLVMBuildExtractElement(builder, 299 a, index, ""), 300 ""); 301 } 302 303 return res; 304} 305 306 307/** 308 * Generate a - b 309 */ 310LLVMValueRef 311lp_build_sub(struct lp_build_context *bld, 312 LLVMValueRef a, 313 LLVMValueRef b) 314{ 315 LLVMBuilderRef builder = bld->gallivm->builder; 316 const struct lp_type type = bld->type; 317 LLVMValueRef res; 318 319 assert(lp_check_value(type, a)); 320 assert(lp_check_value(type, b)); 321 322 if(b == bld->zero) 323 return a; 324 if(a == bld->undef 
|| b == bld->undef) 325 return bld->undef; 326 if(a == b) 327 return bld->zero; 328 329 if(bld->type.norm) { 330 const char *intrinsic = NULL; 331 332 if(b == bld->one) 333 return bld->zero; 334 335 if(util_cpu_caps.has_sse2 && 336 type.width * type.length == 128 && 337 !type.floating && !type.fixed) { 338 if(type.width == 8) 339 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; 340 if(type.width == 16) 341 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; 342 } 343 344 if(intrinsic) 345 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 346 } 347 348 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 349 if (type.floating) 350 res = LLVMConstFSub(a, b); 351 else 352 res = LLVMConstSub(a, b); 353 else 354 if (type.floating) 355 res = LLVMBuildFSub(builder, a, b, ""); 356 else 357 res = LLVMBuildSub(builder, a, b, ""); 358 359 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 360 res = lp_build_max_simple(bld, res, bld->zero); 361 362 return res; 363} 364 365 366/** 367 * Normalized 8bit multiplication. 368 * 369 * - alpha plus one 370 * 371 * makes the following approximation to the division (Sree) 372 * 373 * a*b/255 ~= (a*(b + 1)) >> 256 374 * 375 * which is the fastest method that satisfies the following OpenGL criteria 376 * 377 * 0*0 = 0 and 255*255 = 255 378 * 379 * - geometric series 380 * 381 * takes the geometric series approximation to the division 382 * 383 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) .. 
384 * 385 * in this case just the first two terms to fit in 16bit arithmetic 386 * 387 * t/255 ~= (t + (t >> 8)) >> 8 388 * 389 * note that just by itself it doesn't satisfies the OpenGL criteria, as 390 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff 391 * must be used 392 * 393 * - geometric series plus rounding 394 * 395 * when using a geometric series division instead of truncating the result 396 * use roundoff in the approximation (Jim Blinn) 397 * 398 * t/255 ~= (t + (t >> 8) + 0x80) >> 8 399 * 400 * achieving the exact results 401 * 402 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 403 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf 404 * @sa Michael Herf, The "double blend trick", May 2000, 405 * http://www.stereopsis.com/doubleblend.html 406 */ 407static LLVMValueRef 408lp_build_mul_u8n(struct gallivm_state *gallivm, 409 struct lp_type i16_type, 410 LLVMValueRef a, LLVMValueRef b) 411{ 412 LLVMBuilderRef builder = gallivm->builder; 413 LLVMValueRef c8; 414 LLVMValueRef ab; 415 416 assert(!i16_type.floating); 417 assert(lp_check_value(i16_type, a)); 418 assert(lp_check_value(i16_type, b)); 419 420 c8 = lp_build_const_int_vec(gallivm, i16_type, 8); 421 422#if 0 423 424 /* a*b/255 ~= (a*(b + 1)) >> 256 */ 425 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallium, i16_type, 1), ""); 426 ab = LLVMBuildMul(builder, a, b, ""); 427 428#else 429 430 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */ 431 ab = LLVMBuildMul(builder, a, b, ""); 432 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), ""); 433 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), ""); 434 435#endif 436 437 ab = LLVMBuildLShr(builder, ab, c8, ""); 438 439 return ab; 440} 441 442 443/** 444 * Generate a * b 445 */ 446LLVMValueRef 447lp_build_mul(struct lp_build_context *bld, 448 LLVMValueRef a, 449 LLVMValueRef b) 450{ 451 LLVMBuilderRef builder = bld->gallivm->builder; 452 const struct 
lp_type type = bld->type; 453 LLVMValueRef shift; 454 LLVMValueRef res; 455 456 assert(lp_check_value(type, a)); 457 assert(lp_check_value(type, b)); 458 459 if(a == bld->zero) 460 return bld->zero; 461 if(a == bld->one) 462 return b; 463 if(b == bld->zero) 464 return bld->zero; 465 if(b == bld->one) 466 return a; 467 if(a == bld->undef || b == bld->undef) 468 return bld->undef; 469 470 if(!type.floating && !type.fixed && type.norm) { 471 if(type.width == 8) { 472 struct lp_type i16_type = lp_wider_type(type); 473 LLVMValueRef al, ah, bl, bh, abl, abh, ab; 474 475 lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah); 476 lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh); 477 478 /* PMULLW, PSRLW, PADDW */ 479 abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl); 480 abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh); 481 482 ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh); 483 484 return ab; 485 } 486 487 /* FIXME */ 488 assert(0); 489 } 490 491 if(type.fixed) 492 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2); 493 else 494 shift = NULL; 495 496 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 497 if (type.floating) 498 res = LLVMConstFMul(a, b); 499 else 500 res = LLVMConstMul(a, b); 501 if(shift) { 502 if(type.sign) 503 res = LLVMConstAShr(res, shift); 504 else 505 res = LLVMConstLShr(res, shift); 506 } 507 } 508 else { 509 if (type.floating) 510 res = LLVMBuildFMul(builder, a, b, ""); 511 else 512 res = LLVMBuildMul(builder, a, b, ""); 513 if(shift) { 514 if(type.sign) 515 res = LLVMBuildAShr(builder, res, shift, ""); 516 else 517 res = LLVMBuildLShr(builder, res, shift, ""); 518 } 519 } 520 521 return res; 522} 523 524 525/** 526 * Small vector x scale multiplication optimization. 
527 */ 528LLVMValueRef 529lp_build_mul_imm(struct lp_build_context *bld, 530 LLVMValueRef a, 531 int b) 532{ 533 LLVMBuilderRef builder = bld->gallivm->builder; 534 LLVMValueRef factor; 535 536 assert(lp_check_value(bld->type, a)); 537 538 if(b == 0) 539 return bld->zero; 540 541 if(b == 1) 542 return a; 543 544 if(b == -1) 545 return lp_build_negate(bld, a); 546 547 if(b == 2 && bld->type.floating) 548 return lp_build_add(bld, a, a); 549 550 if(util_is_power_of_two(b)) { 551 unsigned shift = ffs(b) - 1; 552 553 if(bld->type.floating) { 554#if 0 555 /* 556 * Power of two multiplication by directly manipulating the mantissa. 557 * 558 * XXX: This might not be always faster, it will introduce a small error 559 * for multiplication by zero, and it will produce wrong results 560 * for Inf and NaN. 561 */ 562 unsigned mantissa = lp_mantissa(bld->type); 563 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa); 564 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), ""); 565 a = LLVMBuildAdd(builder, a, factor, ""); 566 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), ""); 567 return a; 568#endif 569 } 570 else { 571 factor = lp_build_const_vec(bld->gallivm, bld->type, shift); 572 return LLVMBuildShl(builder, a, factor, ""); 573 } 574 } 575 576 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b); 577 return lp_build_mul(bld, a, factor); 578} 579 580 581/** 582 * Generate a / b 583 */ 584LLVMValueRef 585lp_build_div(struct lp_build_context *bld, 586 LLVMValueRef a, 587 LLVMValueRef b) 588{ 589 LLVMBuilderRef builder = bld->gallivm->builder; 590 const struct lp_type type = bld->type; 591 592 assert(lp_check_value(type, a)); 593 assert(lp_check_value(type, b)); 594 595 if(a == bld->zero) 596 return bld->zero; 597 if(a == bld->one) 598 return lp_build_rcp(bld, b); 599 if(b == bld->zero) 600 return bld->undef; 601 if(b == bld->one) 602 return a; 603 if(a == bld->undef || b == 
bld->undef) 604 return bld->undef; 605 606 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 607 if (type.floating) 608 return LLVMConstFDiv(a, b); 609 else if (type.sign) 610 return LLVMConstSDiv(a, b); 611 else 612 return LLVMConstUDiv(a, b); 613 } 614 615 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 && 616 type.floating) 617 return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 618 619 if (type.floating) 620 return LLVMBuildFDiv(builder, a, b, ""); 621 else if (type.sign) 622 return LLVMBuildSDiv(builder, a, b, ""); 623 else 624 return LLVMBuildUDiv(builder, a, b, ""); 625} 626 627 628/** 629 * Linear interpolation -- without any checks. 630 * 631 * @sa http://www.stereopsis.com/doubleblend.html 632 */ 633static INLINE LLVMValueRef 634lp_build_lerp_simple(struct lp_build_context *bld, 635 LLVMValueRef x, 636 LLVMValueRef v0, 637 LLVMValueRef v1) 638{ 639 LLVMBuilderRef builder = bld->gallivm->builder; 640 LLVMValueRef delta; 641 LLVMValueRef res; 642 643 assert(lp_check_value(bld->type, x)); 644 assert(lp_check_value(bld->type, v0)); 645 assert(lp_check_value(bld->type, v1)); 646 647 delta = lp_build_sub(bld, v1, v0); 648 649 res = lp_build_mul(bld, x, delta); 650 651 res = lp_build_add(bld, v0, res); 652 653 if (bld->type.fixed) { 654 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits, 655 * but it will be wrong for other uses. Basically we need a more 656 * powerful lp_type, capable of further distinguishing the values 657 * interpretation from the value storage. */ 658 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), ""); 659 } 660 661 return res; 662} 663 664 665/** 666 * Linear interpolation. 
667 */ 668LLVMValueRef 669lp_build_lerp(struct lp_build_context *bld, 670 LLVMValueRef x, 671 LLVMValueRef v0, 672 LLVMValueRef v1) 673{ 674 LLVMBuilderRef builder = bld->gallivm->builder; 675 const struct lp_type type = bld->type; 676 LLVMValueRef res; 677 678 assert(lp_check_value(type, x)); 679 assert(lp_check_value(type, v0)); 680 assert(lp_check_value(type, v1)); 681 682 if (type.norm) { 683 struct lp_type wide_type; 684 struct lp_build_context wide_bld; 685 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; 686 LLVMValueRef shift; 687 688 assert(type.length >= 2); 689 assert(!type.sign); 690 691 /* 692 * Create a wider type, enough to hold the intermediate result of the 693 * multiplication. 694 */ 695 memset(&wide_type, 0, sizeof wide_type); 696 wide_type.fixed = TRUE; 697 wide_type.width = type.width*2; 698 wide_type.length = type.length/2; 699 700 lp_build_context_init(&wide_bld, bld->gallivm, wide_type); 701 702 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh); 703 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h); 704 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h); 705 706 /* 707 * Scale x from [0, 255] to [0, 256] 708 */ 709 710 shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1); 711 712 xl = lp_build_add(&wide_bld, xl, 713 LLVMBuildAShr(builder, xl, shift, "")); 714 xh = lp_build_add(&wide_bld, xh, 715 LLVMBuildAShr(builder, xh, shift, "")); 716 717 /* 718 * Lerp both halves. 
719 */ 720 721 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l); 722 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h); 723 724 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh); 725 } else { 726 res = lp_build_lerp_simple(bld, x, v0, v1); 727 } 728 729 return res; 730} 731 732 733LLVMValueRef 734lp_build_lerp_2d(struct lp_build_context *bld, 735 LLVMValueRef x, 736 LLVMValueRef y, 737 LLVMValueRef v00, 738 LLVMValueRef v01, 739 LLVMValueRef v10, 740 LLVMValueRef v11) 741{ 742 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01); 743 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11); 744 return lp_build_lerp(bld, y, v0, v1); 745} 746 747 748/** 749 * Generate min(a, b) 750 * Do checks for special cases. 751 */ 752LLVMValueRef 753lp_build_min(struct lp_build_context *bld, 754 LLVMValueRef a, 755 LLVMValueRef b) 756{ 757 assert(lp_check_value(bld->type, a)); 758 assert(lp_check_value(bld->type, b)); 759 760 if(a == bld->undef || b == bld->undef) 761 return bld->undef; 762 763 if(a == b) 764 return a; 765 766 if (bld->type.norm) { 767 if (!bld->type.sign) { 768 if (a == bld->zero || b == bld->zero) { 769 return bld->zero; 770 } 771 } 772 if(a == bld->one) 773 return b; 774 if(b == bld->one) 775 return a; 776 } 777 778 return lp_build_min_simple(bld, a, b); 779} 780 781 782/** 783 * Generate max(a, b) 784 * Do checks for special cases. 
785 */ 786LLVMValueRef 787lp_build_max(struct lp_build_context *bld, 788 LLVMValueRef a, 789 LLVMValueRef b) 790{ 791 assert(lp_check_value(bld->type, a)); 792 assert(lp_check_value(bld->type, b)); 793 794 if(a == bld->undef || b == bld->undef) 795 return bld->undef; 796 797 if(a == b) 798 return a; 799 800 if(bld->type.norm) { 801 if(a == bld->one || b == bld->one) 802 return bld->one; 803 if (!bld->type.sign) { 804 if (a == bld->zero) { 805 return b; 806 } 807 if (b == bld->zero) { 808 return a; 809 } 810 } 811 } 812 813 return lp_build_max_simple(bld, a, b); 814} 815 816 817/** 818 * Generate clamp(a, min, max) 819 * Do checks for special cases. 820 */ 821LLVMValueRef 822lp_build_clamp(struct lp_build_context *bld, 823 LLVMValueRef a, 824 LLVMValueRef min, 825 LLVMValueRef max) 826{ 827 assert(lp_check_value(bld->type, a)); 828 assert(lp_check_value(bld->type, min)); 829 assert(lp_check_value(bld->type, max)); 830 831 a = lp_build_min(bld, a, max); 832 a = lp_build_max(bld, a, min); 833 return a; 834} 835 836 837/** 838 * Generate abs(a) 839 */ 840LLVMValueRef 841lp_build_abs(struct lp_build_context *bld, 842 LLVMValueRef a) 843{ 844 LLVMBuilderRef builder = bld->gallivm->builder; 845 const struct lp_type type = bld->type; 846 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 847 848 assert(lp_check_value(type, a)); 849 850 if(!type.sign) 851 return a; 852 853 if(type.floating) { 854 /* Mask out the sign bit */ 855 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 856 unsigned long long absMask = ~(1ULL << (type.width - 1)); 857 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask)); 858 a = LLVMBuildBitCast(builder, a, int_vec_type, ""); 859 a = LLVMBuildAnd(builder, a, mask, ""); 860 a = LLVMBuildBitCast(builder, a, vec_type, ""); 861 return a; 862 } 863 864 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) { 865 switch(type.width) { 866 case 8: 867 return 
lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 868 case 16: 869 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 870 case 32: 871 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 872 } 873 } 874 875 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, "")); 876} 877 878 879LLVMValueRef 880lp_build_negate(struct lp_build_context *bld, 881 LLVMValueRef a) 882{ 883 LLVMBuilderRef builder = bld->gallivm->builder; 884 885 assert(lp_check_value(bld->type, a)); 886 887#if HAVE_LLVM >= 0x0207 888 if (bld->type.floating) 889 a = LLVMBuildFNeg(builder, a, ""); 890 else 891#endif 892 a = LLVMBuildNeg(builder, a, ""); 893 894 return a; 895} 896 897 898/** Return -1, 0 or +1 depending on the sign of a */ 899LLVMValueRef 900lp_build_sgn(struct lp_build_context *bld, 901 LLVMValueRef a) 902{ 903 LLVMBuilderRef builder = bld->gallivm->builder; 904 const struct lp_type type = bld->type; 905 LLVMValueRef cond; 906 LLVMValueRef res; 907 908 assert(lp_check_value(type, a)); 909 910 /* Handle non-zero case */ 911 if(!type.sign) { 912 /* if not zero then sign must be positive */ 913 res = bld->one; 914 } 915 else if(type.floating) { 916 LLVMTypeRef vec_type; 917 LLVMTypeRef int_type; 918 LLVMValueRef mask; 919 LLVMValueRef sign; 920 LLVMValueRef one; 921 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); 922 923 int_type = lp_build_int_vec_type(bld->gallivm, type); 924 vec_type = lp_build_vec_type(bld->gallivm, type); 925 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit); 926 927 /* Take the sign bit and add it to 1 constant */ 928 sign = LLVMBuildBitCast(builder, a, int_type, ""); 929 sign = LLVMBuildAnd(builder, sign, mask, ""); 930 one = LLVMConstBitCast(bld->one, int_type); 931 res = LLVMBuildOr(builder, sign, one, ""); 932 res = LLVMBuildBitCast(builder, res, vec_type, ""); 933 } 934 else 935 { 936 /* signed int/norm/fixed point */ 937 LLVMValueRef 
minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); 938 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); 939 res = lp_build_select(bld, cond, bld->one, minus_one); 940 } 941 942 /* Handle zero */ 943 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero); 944 res = lp_build_select(bld, cond, bld->zero, res); 945 946 return res; 947} 948 949 950/** 951 * Set the sign of float vector 'a' according to 'sign'. 952 * If sign==0, return abs(a). 953 * If sign==1, return -abs(a); 954 * Other values for sign produce undefined results. 955 */ 956LLVMValueRef 957lp_build_set_sign(struct lp_build_context *bld, 958 LLVMValueRef a, LLVMValueRef sign) 959{ 960 LLVMBuilderRef builder = bld->gallivm->builder; 961 const struct lp_type type = bld->type; 962 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 963 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 964 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1); 965 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 966 ~((unsigned long long) 1 << (type.width - 1))); 967 LLVMValueRef val, res; 968 969 assert(type.floating); 970 assert(lp_check_value(type, a)); 971 972 /* val = reinterpret_cast<int>(a) */ 973 val = LLVMBuildBitCast(builder, a, int_vec_type, ""); 974 /* val = val & mask */ 975 val = LLVMBuildAnd(builder, val, mask, ""); 976 /* sign = sign << shift */ 977 sign = LLVMBuildShl(builder, sign, shift, ""); 978 /* res = val | sign */ 979 res = LLVMBuildOr(builder, val, sign, ""); 980 /* res = reinterpret_cast<float>(res) */ 981 res = LLVMBuildBitCast(builder, res, vec_type, ""); 982 983 return res; 984} 985 986 987/** 988 * Convert vector of (or scalar) int to vector of (or scalar) float. 
989 */ 990LLVMValueRef 991lp_build_int_to_float(struct lp_build_context *bld, 992 LLVMValueRef a) 993{ 994 LLVMBuilderRef builder = bld->gallivm->builder; 995 const struct lp_type type = bld->type; 996 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 997 998 assert(type.floating); 999 1000 return LLVMBuildSIToFP(builder, a, vec_type, ""); 1001} 1002 1003 1004 1005enum lp_build_round_sse41_mode 1006{ 1007 LP_BUILD_ROUND_SSE41_NEAREST = 0, 1008 LP_BUILD_ROUND_SSE41_FLOOR = 1, 1009 LP_BUILD_ROUND_SSE41_CEIL = 2, 1010 LP_BUILD_ROUND_SSE41_TRUNCATE = 3 1011}; 1012 1013 1014/** 1015 * Helper for SSE4.1's ROUNDxx instructions. 1016 * 1017 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the 1018 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0. 1019 */ 1020static INLINE LLVMValueRef 1021lp_build_round_sse41(struct lp_build_context *bld, 1022 LLVMValueRef a, 1023 enum lp_build_round_sse41_mode mode) 1024{ 1025 LLVMBuilderRef builder = bld->gallivm->builder; 1026 const struct lp_type type = bld->type; 1027 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1028 const char *intrinsic; 1029 LLVMValueRef res; 1030 1031 assert(type.floating); 1032 1033 assert(lp_check_value(type, a)); 1034 assert(util_cpu_caps.has_sse4_1); 1035 1036 if (type.length == 1) { 1037 LLVMTypeRef vec_type; 1038 LLVMValueRef undef; 1039 LLVMValueRef args[3]; 1040 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1041 1042 switch(type.width) { 1043 case 32: 1044 intrinsic = "llvm.x86.sse41.round.ss"; 1045 break; 1046 case 64: 1047 intrinsic = "llvm.x86.sse41.round.sd"; 1048 break; 1049 default: 1050 assert(0); 1051 return bld->undef; 1052 } 1053 1054 vec_type = LLVMVectorType(bld->elem_type, 4); 1055 1056 undef = LLVMGetUndef(vec_type); 1057 1058 args[0] = undef; 1059 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1060 args[2] = LLVMConstInt(i32t, mode, 0); 1061 1062 res = lp_build_intrinsic(builder, intrinsic, 
1063 vec_type, args, Elements(args)); 1064 1065 res = LLVMBuildExtractElement(builder, res, index0, ""); 1066 } 1067 else { 1068 assert(type.width*type.length == 128); 1069 1070 switch(type.width) { 1071 case 32: 1072 intrinsic = "llvm.x86.sse41.round.ps"; 1073 break; 1074 case 64: 1075 intrinsic = "llvm.x86.sse41.round.pd"; 1076 break; 1077 default: 1078 assert(0); 1079 return bld->undef; 1080 } 1081 1082 res = lp_build_intrinsic_binary(builder, intrinsic, 1083 bld->vec_type, a, 1084 LLVMConstInt(i32t, mode, 0)); 1085 } 1086 1087 return res; 1088} 1089 1090 1091static INLINE LLVMValueRef 1092lp_build_iround_nearest_sse2(struct lp_build_context *bld, 1093 LLVMValueRef a) 1094{ 1095 LLVMBuilderRef builder = bld->gallivm->builder; 1096 const struct lp_type type = bld->type; 1097 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1098 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type); 1099 const char *intrinsic; 1100 LLVMValueRef res; 1101 1102 assert(type.floating); 1103 /* using the double precision conversions is a bit more complicated */ 1104 assert(type.width == 32); 1105 1106 assert(lp_check_value(type, a)); 1107 assert(util_cpu_caps.has_sse2); 1108 1109 /* This is relying on MXCSR rounding mode, which should always be nearest. 
*/ 1110 if (type.length == 1) { 1111 LLVMTypeRef vec_type; 1112 LLVMValueRef undef; 1113 LLVMValueRef arg; 1114 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1115 1116 vec_type = LLVMVectorType(bld->elem_type, 4); 1117 1118 intrinsic = "llvm.x86.sse.cvtss2si"; 1119 1120 undef = LLVMGetUndef(vec_type); 1121 1122 arg = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1123 1124 res = lp_build_intrinsic_unary(builder, intrinsic, 1125 ret_type, arg); 1126 } 1127 else { 1128 assert(type.width*type.length == 128); 1129 1130 intrinsic = "llvm.x86.sse2.cvtps2dq"; 1131 1132 res = lp_build_intrinsic_unary(builder, intrinsic, 1133 ret_type, a); 1134 } 1135 1136 return res; 1137} 1138 1139 1140/** 1141 * Return the integer part of a float (vector) value (== round toward zero). 1142 * The returned value is a float (vector). 1143 * Ex: trunc(-1.5) = -1.0 1144 */ 1145LLVMValueRef 1146lp_build_trunc(struct lp_build_context *bld, 1147 LLVMValueRef a) 1148{ 1149 LLVMBuilderRef builder = bld->gallivm->builder; 1150 const struct lp_type type = bld->type; 1151 1152 assert(type.floating); 1153 assert(lp_check_value(type, a)); 1154 1155 if (util_cpu_caps.has_sse4_1 && 1156 (type.length == 1 || type.width*type.length == 128)) { 1157 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); 1158 } 1159 else { 1160 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1161 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1162 LLVMValueRef res; 1163 res = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 1164 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1165 return res; 1166 } 1167} 1168 1169 1170/** 1171 * Return float (vector) rounded to nearest integer (vector). The returned 1172 * value is a float (vector). 
1173 * Ex: round(0.9) = 1.0 1174 * Ex: round(-1.5) = -2.0 1175 */ 1176LLVMValueRef 1177lp_build_round(struct lp_build_context *bld, 1178 LLVMValueRef a) 1179{ 1180 LLVMBuilderRef builder = bld->gallivm->builder; 1181 const struct lp_type type = bld->type; 1182 1183 assert(type.floating); 1184 assert(lp_check_value(type, a)); 1185 1186 if (util_cpu_caps.has_sse4_1 && 1187 (type.length == 1 || type.width*type.length == 128)) { 1188 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 1189 } 1190 else { 1191 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1192 LLVMValueRef res; 1193 res = lp_build_iround(bld, a); 1194 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1195 return res; 1196 } 1197} 1198 1199 1200/** 1201 * Return floor of float (vector), result is a float (vector) 1202 * Ex: floor(1.1) = 1.0 1203 * Ex: floor(-1.1) = -2.0 1204 */ 1205LLVMValueRef 1206lp_build_floor(struct lp_build_context *bld, 1207 LLVMValueRef a) 1208{ 1209 LLVMBuilderRef builder = bld->gallivm->builder; 1210 const struct lp_type type = bld->type; 1211 1212 assert(type.floating); 1213 assert(lp_check_value(type, a)); 1214 1215 if (util_cpu_caps.has_sse4_1 && 1216 (type.length == 1 || type.width*type.length == 128)) { 1217 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 1218 } 1219 else { 1220 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1221 LLVMValueRef res; 1222 res = lp_build_ifloor(bld, a); 1223 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1224 return res; 1225 } 1226} 1227 1228 1229/** 1230 * Return ceiling of float (vector), returning float (vector). 
1231 * Ex: ceil( 1.1) = 2.0 1232 * Ex: ceil(-1.1) = -1.0 1233 */ 1234LLVMValueRef 1235lp_build_ceil(struct lp_build_context *bld, 1236 LLVMValueRef a) 1237{ 1238 LLVMBuilderRef builder = bld->gallivm->builder; 1239 const struct lp_type type = bld->type; 1240 1241 assert(type.floating); 1242 assert(lp_check_value(type, a)); 1243 1244 if (util_cpu_caps.has_sse4_1 && 1245 (type.length == 1 || type.width*type.length == 128)) { 1246 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 1247 } 1248 else { 1249 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1250 LLVMValueRef res; 1251 res = lp_build_iceil(bld, a); 1252 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1253 return res; 1254 } 1255} 1256 1257 1258/** 1259 * Return fractional part of 'a' computed as a - floor(a) 1260 * Typically used in texture coord arithmetic. 1261 */ 1262LLVMValueRef 1263lp_build_fract(struct lp_build_context *bld, 1264 LLVMValueRef a) 1265{ 1266 assert(bld->type.floating); 1267 return lp_build_sub(bld, a, lp_build_floor(bld, a)); 1268} 1269 1270 1271/** 1272 * Return the integer part of a float (vector) value (== round toward zero). 1273 * The returned value is an integer (vector). 1274 * Ex: itrunc(-1.5) = -1 1275 */ 1276LLVMValueRef 1277lp_build_itrunc(struct lp_build_context *bld, 1278 LLVMValueRef a) 1279{ 1280 LLVMBuilderRef builder = bld->gallivm->builder; 1281 const struct lp_type type = bld->type; 1282 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1283 1284 assert(type.floating); 1285 assert(lp_check_value(type, a)); 1286 1287 return LLVMBuildFPToSI(builder, a, int_vec_type, ""); 1288} 1289 1290 1291/** 1292 * Return float (vector) rounded to nearest integer (vector). The returned 1293 * value is an integer (vector). 
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   else if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      /* Generic fallback: add copysign(0.5, a) so the final truncation
       * toward zero rounds away from zero, i.e. to nearest. */
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 -- OR a's sign bit into the 0.5 constant */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   /* truncate toward zero (a is already biased by +/-0.5 where needed) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}


/**
 * Return floor of float (vector), result is an int (vector)
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      res = a;

      if (type.sign) {
         /* Take the sign bit and add it to 1 constant */
         LLVMTypeRef vec_type = bld->vec_type;
         unsigned mantissa = lp_mantissa(type);
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                  (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;
         LLVMValueRef offset;

         /* sign = a < 0 ? ~0 : 0 -- arithmetic shift replicates the sign
          * bit across the whole lane, yielding an all-ones/all-zeros mask */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "ifloor.sign");

         /* offset = -0.99999(9)f -- slightly less than one in magnitude so
          * that exact negative integers are not pushed down a whole unit */
         offset = lp_build_const_vec(bld->gallivm, type,
                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
         offset = LLVMConstBitCast(offset, int_vec_type);

         /* offset = a < 0 ? offset : 0.0f */
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");

         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = bld->vec_type;
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef offset;

      /* offset = 0.99999(9)f -- slightly less than one so that exact
       * positive integers are not pushed up a whole unit */
      offset = lp_build_const_vec(bld->gallivm, type,
                                  (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));

      if (type.sign) {
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                  (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* sign = a < 0 ? 0 : ~0 -- replicate sign bit, then invert */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "iceil.sign");
         sign = LLVMBuildNot(builder, sign, "iceil.not");

         /* offset = a < 0 ? 0.0 : offset */
         offset = LLVMConstBitCast(offset, int_vec_type);
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
      }

      res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}


/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


/**
 * Generate sqrt(a) via the llvm.sqrt.* intrinsic for this vector type.
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}


/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, a, rcp_a, "");
   res = LLVMBuildFSub(builder, two, res, "");
   res = LLVMBuildFMul(builder, rcp_a, res, "");

   return res;
}


/**
 * Generate 1/a.
 */
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   /* Trivial cases that need no IR at all. */
   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}


/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also:
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}


/**
 * Generate 1/sqrt(a)
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      /* RSQRTPS gives ~12 bits; one refinement step recovers precision. */
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res =
lp_build_rsqrt_refine(bld, a, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate sin(a) using SSE2
 *
 * NOTE(review): the _mm_* lines in the comments below are the original SSE2
 * intrinsic sequence (cephes-derived) that this LLVM IR transliterates.
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0, 2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2, 4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1, 8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= 0)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef
poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   return y_result;
}


/**
 * Generate cos(a) using SSE2
 *
 * NOTE(review): same cephes-derived range reduction as lp_build_sin() above,
 * with the quadrant shifted by 2 (cos(x) = sin(x + Pi/2)) and no initial
 * sign extraction since cos is even.
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");


   /*
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    */
   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");


   /* get the swap sign flag
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0, 2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2, 4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1, 8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= 0)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2079 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); 2080 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); 2081 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); 2082 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 2083 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); 2084 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); 2085 2086 /* 2087 * select the correct result from the two polynoms 2088 * xmm3 = poly_mask; 2089 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 2090 * y = _mm_andnot_ps(xmm3, y); 2091 * y = _mm_add_ps(y,y2); 2092 */ 2093 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i"); 2094 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i"); 2095 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 2096 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); 2097 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 2098 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); 2099 2100 /* 2101 * update the sign 2102 * y = _mm_xor_ps(y, sign_bit); 2103 */ 2104 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin"); 2105 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result"); 2106 return y_result; 2107} 2108 2109 2110/** 2111 * Generate pow(x, y) 2112 */ 2113LLVMValueRef 2114lp_build_pow(struct lp_build_context *bld, 2115 LLVMValueRef x, 2116 LLVMValueRef y) 2117{ 2118 /* TODO: optimize the constant case */ 2119 if (gallivm_debug & GALLIVM_DEBUG_PERF && 2120 LLVMIsConstant(x) && LLVMIsConstant(y)) { 2121 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2122 __FUNCTION__); 2123 } 2124 2125 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y)); 2126} 2127 2128 2129/** 2130 * Generate exp(x) 2131 */ 2132LLVMValueRef 2133lp_build_exp(struct lp_build_context *bld, 2134 LLVMValueRef x) 2135{ 2136 /* log2(e) = 1/log(2) */ 2137 LLVMValueRef log2e = 
                       lp_build_const_vec(bld->gallivm, bld->type,
                                          1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}


/**
 * Generate log(x)
 *
 * Computed as log2(x) * log(2).
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}


/**
 * Generate polynomial.
 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   /* Walk coefficients from highest to lowest degree, accumulating the
    * even- and odd-degree terms in two independent Horner chains. */
   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
         else
            even = coeff;
      } else {
         if (odd)
            odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
   else if (even)
      return even;
   else
      /* num_coeffs == 0: nothing to evaluate. */
      return bld->undef;
}


/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999925063526176901,
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};


/**
 * Generate exp2(x), optionally also returning its integer and fractional
 * parts.
 *
 * Any of the output pointers may be NULL; only the requested values are
 * computed.  Only implemented for 32-bit floats (asserted below).
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      /* The bit tricks below (bias 127, shift by 23) are specific to
       * single precision floats. */
      assert(type.floating && type.width == 32);

      /* Clamp x so that ipart stays within the range the exponent-field
       * construction below can represent. */
      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      /* fpart = x - ipart */
      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) -- built by writing the biased
       * exponent directly into the float bit pattern. */
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* expfpart = 2^fpart via the minimax polynomial over [0, 1[ */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      /* 2^x = 2^ipart * 2^fpart */
      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}


/**
 * Generate exp2(x) (the full approximation only).
 */
LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
   return res;
}


/**
 * Extract the exponent of a IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
2331 * 2332 * Result is an integer value with 2333 * 2334 * ifloor(log2(x)) + bias 2335 */ 2336LLVMValueRef 2337lp_build_extract_exponent(struct lp_build_context *bld, 2338 LLVMValueRef x, 2339 int bias) 2340{ 2341 LLVMBuilderRef builder = bld->gallivm->builder; 2342 const struct lp_type type = bld->type; 2343 unsigned mantissa = lp_mantissa(type); 2344 LLVMValueRef res; 2345 2346 assert(type.floating); 2347 2348 assert(lp_check_value(bld->type, x)); 2349 2350 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 2351 2352 res = LLVMBuildLShr(builder, x, 2353 lp_build_const_int_vec(bld->gallivm, type, mantissa), ""); 2354 res = LLVMBuildAnd(builder, res, 2355 lp_build_const_int_vec(bld->gallivm, type, 255), ""); 2356 res = LLVMBuildSub(builder, res, 2357 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), ""); 2358 2359 return res; 2360} 2361 2362 2363/** 2364 * Extract the mantissa of the a floating. 2365 * 2366 * Result is a floating point value with 2367 * 2368 * x / floor(log2(x)) 2369 */ 2370LLVMValueRef 2371lp_build_extract_mantissa(struct lp_build_context *bld, 2372 LLVMValueRef x) 2373{ 2374 LLVMBuilderRef builder = bld->gallivm->builder; 2375 const struct lp_type type = bld->type; 2376 unsigned mantissa = lp_mantissa(type); 2377 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 2378 (1ULL << mantissa) - 1); 2379 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); 2380 LLVMValueRef res; 2381 2382 assert(lp_check_value(bld->type, x)); 2383 2384 assert(type.floating); 2385 2386 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 2387 2388 /* res = x / 2**ipart */ 2389 res = LLVMBuildAnd(builder, x, mantmask, ""); 2390 res = LLVMBuildOr(builder, res, one, ""); 2391 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 2392 2393 return res; 2394} 2395 2396 2397 2398/** 2399 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[ 2400 * These coefficients can be generate 
 * with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};

/**
 * Generate log2(x), optionally also returning the raw exponent bits and
 * floor(log2(x)).  Any of the output pointers may be NULL.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   /* Exponent and mantissa field masks of a single precision float. */
   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the
         constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* logexp = floor(log2(x)): shift the exponent field down, remove
       * the bias, and convert to float. */
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
         lp_build_sub(bld, mant, bld->one),
         lp_build_add(bld, mant, bld->one)
      );

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* logmant = y * P(z) */
      logmant = lp_build_mul(bld, y, logmant);

      /* log2(x) = log2(mant) + floor(log2(x)) */
      res = lp_build_add(bld, logmant, logexp);
   }

   if(p_exp) {
      /* NOTE(review): this is the raw, still-biased exponent field
       * reinterpreted as a float, not a numeric exponent value. */
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}


/**
 * Generate log2(x) (the full approximation only).
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart, in [1, 2[ */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5) i.e., add 0.5 to the log2(x), so that extracting the
    * exponent afterwards rounds to nearest rather than truncating. */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5) */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}

/**
 * Generate x % y, using the remainder instruction matching the type
 * (frem / srem / urem).
 *
 * NOTE(review): for integer types a zero divisor makes the generated
 * instruction undefined behavior; callers must guarantee y != 0.
 */
LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}