lp_bld_arit.c revision dbadd395082d1c812733dff8c873f475d82c63e4
/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly.
The functions here hide all 368cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * these implementation details from the other modules. 378cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * 388cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * We also do simple expressions simplification here. Reasons are: 398cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * - it is very easy given we have all necessary information readily available 408cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * - LLVM optimization passes fail to simplify several vector expressions 418cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * - We often know value constraints which the optimization passes have no way 428cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * of knowing, such as when source arguments are known to be in [0, 1] range. 438cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * 448cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * @author Jose Fonseca <jfonseca@vmware.com> 458cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */ 468cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 478cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 488cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "util/u_memory.h" 498cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "util/u_debug.h" 508cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "util/u_math.h" 518cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "util/u_string.h" 528cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "util/u_cpu_detect.h" 538cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 548cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_type.h" 558cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_const.h" 568cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_init.h" 
578cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_intr.h" 588cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_logic.h" 598cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_pack.h" 608cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_debug.h" 618cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_arit.h" 628cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 638cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 648cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#define EXP_POLY_DEGREE 5 658cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 668cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#define LOG_POLY_DEGREE 5 678cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 688cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 698cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/** 708cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate min(a, b) 718cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * No checks for special case values of a or b = 1 or 0 are done. 
728cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */ 738cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comstatic LLVMValueRef 748cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_min_simple(struct lp_build_context *bld, 758cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef a, 768cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef b) 778cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{ 788cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMBuilderRef builder = bld->gallivm->builder; 798cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com const struct lp_type type = bld->type; 808cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com const char *intrinsic = NULL; 818cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef cond; 828cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 838cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(type, a)); 848cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(type, b)); 858cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 868cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com /* TODO: optimize the constant case */ 878cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 888cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width * type.length == 128) { 898cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.floating) { 908cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 32 && util_cpu_caps.has_sse) 918cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse.min.ps"; 928cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 64 && util_cpu_caps.has_sse2) 938cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse2.min.pd"; 948cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 
958cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else { 968cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) 978cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse2.pminu.b"; 988cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) 998cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse41.pminsb"; 1008cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) 1018cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse41.pminuw"; 1028cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) 1038cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse2.pmins.w"; 1048cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) 1058cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse41.pminud"; 1068cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) 1078cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse41.pminsd"; 1088cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 1098cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 1108cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1118cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(intrinsic) 1128cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 1138cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1148cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, 
b); 1158cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return lp_build_select(bld, cond, a, b); 1168cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com} 1178cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1188cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1198cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/** 1208cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate max(a, b) 1218cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * No checks for special case values of a or b = 1 or 0 are done. 1228cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */ 1238cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comstatic LLVMValueRef 1248cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_max_simple(struct lp_build_context *bld, 1258cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef a, 1268cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef b) 1278cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{ 1288cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMBuilderRef builder = bld->gallivm->builder; 1298cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com const struct lp_type type = bld->type; 1308cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com const char *intrinsic = NULL; 1318cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef cond; 1328cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1338cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(type, a)); 1348cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(type, b)); 1358cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1368cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com /* TODO: optimize the constant case */ 1378cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1388cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width * type.length == 
128) { 1398cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.floating) { 1408cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 32 && util_cpu_caps.has_sse) 1418cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse.max.ps"; 1428cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 64 && util_cpu_caps.has_sse2) 1438cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse2.max.pd"; 1448cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 1458cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else { 1468cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) 1478cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse2.pmaxu.b"; 1488cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) 1498cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse41.pmaxsb"; 1508cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) 1518cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse41.pmaxuw"; 1528cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) 1538cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse2.pmaxs.w"; 1548cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) 1558cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse41.pmaxud"; 1568cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) 1578cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = "llvm.x86.sse41.pmaxsd"; 
1588cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 1598cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 1608cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1618cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(intrinsic) 1628cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 1638cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1648cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 1658cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return lp_build_select(bld, cond, a, b); 1668cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com} 1678cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1688cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1698cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/** 1708cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate 1 - a, or ~a depending on bld->type. 
1718cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */ 1728cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef 1738cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_comp(struct lp_build_context *bld, 1748cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef a) 1758cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{ 1768cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMBuilderRef builder = bld->gallivm->builder; 1778cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com const struct lp_type type = bld->type; 1788cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1798cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(type, a)); 1808cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1818cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(a == bld->one) 1828cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return bld->zero; 1838cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(a == bld->zero) 1848cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return bld->one; 1858cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1868cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.norm && !type.floating && !type.fixed && !type.sign) { 1878cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(LLVMIsConstant(a)) 1888cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return LLVMConstNot(a); 1898cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 1908cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return LLVMBuildNot(builder, a, ""); 1918cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 1928cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 1938cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(LLVMIsConstant(a)) 1948cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if (type.floating) 
1958cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return LLVMConstFSub(bld->one, a); 1968cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 1978cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return LLVMConstSub(bld->one, a); 1988cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 1998cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if (type.floating) 2008cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return LLVMBuildFSub(builder, bld->one, a, ""); 2018cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 2028cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return LLVMBuildSub(builder, bld->one, a, ""); 2038cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com} 2048cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2058cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2068cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/** 2078cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate a + b 2088cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */ 2098cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef 2108cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_add(struct lp_build_context *bld, 2118cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef a, 2128cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef b) 2138cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{ 2148cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMBuilderRef builder = bld->gallivm->builder; 2158cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com const struct lp_type type = bld->type; 2168cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef res; 2178cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2188cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(type, a)); 2198cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 
assert(lp_check_value(type, b)); 2208cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2218cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(a == bld->zero) 2228cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return b; 2238cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(b == bld->zero) 2248cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return a; 2258cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(a == bld->undef || b == bld->undef) 2268cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return bld->undef; 2278cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2288cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(bld->type.norm) { 2298cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com const char *intrinsic = NULL; 2308cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2318cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(a == bld->one || b == bld->one) 2328cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return bld->one; 2338cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2348cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(util_cpu_caps.has_sse2 && 2358cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com type.width * type.length == 128 && 2368cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com !type.floating && !type.fixed) { 2378cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 8) 2388cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; 2398cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 16) 2408cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = type.sign ? 
"llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; 2418cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 2428cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2438cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(intrinsic) 2448cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 2458cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 2468cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2478cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(LLVMIsConstant(a) && LLVMIsConstant(b)) 2488cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if (type.floating) 2498cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMConstFAdd(a, b); 2508cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 2518cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMConstAdd(a, b); 2528cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 2538cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if (type.floating) 2548cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMBuildFAdd(builder, a, b, ""); 2558cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 2568cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMBuildAdd(builder, a, b, ""); 2578cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2588cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com /* clamp to ceiling of 1.0 */ 2598cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 2608cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = lp_build_min_simple(bld, res, bld->one); 2618cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2628cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com /* XXX clamp to floor of -1 or 0??? 
*/ 2638cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2648cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return res; 2658cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com} 2668cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2678cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2688cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/** Return the scalar sum of the elements of a */ 2698cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef 2708cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_sum_vector(struct lp_build_context *bld, 2718cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef a) 2728cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{ 2738cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMBuilderRef builder = bld->gallivm->builder; 2748cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com const struct lp_type type = bld->type; 2758cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef index, res; 2768cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com unsigned i; 2778cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2788cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(type, a)); 2798cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2808cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if (type.length == 1) { 2818cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return a; 2828cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 2838cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2848cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(!bld->type.norm); 2858cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2868cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com index = lp_build_const_int32(bld->gallivm, 0); 2878cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMBuildExtractElement(builder, a, 
index, ""); 2888cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 2898cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com for (i = 1; i < type.length; i++) { 2908cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com index = lp_build_const_int32(bld->gallivm, i); 2918cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if (type.floating) 2928cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMBuildFAdd(builder, res, 2938cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMBuildExtractElement(builder, 2948cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com a, index, ""), 2958cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com ""); 2968cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 2978cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMBuildAdd(builder, res, 2988cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMBuildExtractElement(builder, 2998cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com a, index, ""), 3008cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com ""); 3018cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 3028cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3038cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return res; 3048cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com} 3058cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3068cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3078cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/** 3088cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate a - b 3098cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */ 3108cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef 3118cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_sub(struct lp_build_context *bld, 3128cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef a, 
3138cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef b) 3148cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{ 3158cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMBuilderRef builder = bld->gallivm->builder; 3168cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com const struct lp_type type = bld->type; 3178cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef res; 3188cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3198cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(type, a)); 3208cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(type, b)); 3218cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3228cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(b == bld->zero) 3238cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return a; 3248cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(a == bld->undef || b == bld->undef) 3258cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return bld->undef; 3268cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(a == b) 3278cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return bld->zero; 3288cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3298cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(bld->type.norm) { 3308cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com const char *intrinsic = NULL; 3318cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3328cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(b == bld->one) 3338cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return bld->zero; 3348cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3358cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(util_cpu_caps.has_sse2 && 3368cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com type.width * type.length == 128 && 
3378cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com !type.floating && !type.fixed) { 3388cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 8) 3398cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; 3408cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 16) 3418cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; 3428cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 3438cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3448cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(intrinsic) 3458cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 3468cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 3478cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3488cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(LLVMIsConstant(a) && LLVMIsConstant(b)) 3498cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if (type.floating) 3508cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMConstFSub(a, b); 3518cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 3528cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMConstSub(a, b); 3538cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 3548cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if (type.floating) 3558cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMBuildFSub(builder, a, b, ""); 3568cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 3578cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMBuildSub(builder, a, b, ""); 3588cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 
3598cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 3608cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = lp_build_max_simple(bld, res, bld->zero); 3618cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3628cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return res; 3638cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com} 3648cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3658cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 3668cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/** 3678cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Normalized 8bit multiplication. 3688cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * 3698cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * - alpha plus one 3708cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * 3718cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * makes the following approximation to the division (Sree) 3728cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * 3738cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * a*b/255 ~= (a*(b + 1)) >> 256 3748cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * 3758cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * which is the fastest method that satisfies the following OpenGL criteria 3768cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * 3778cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * 0*0 = 0 and 255*255 = 255 3788cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * 3798cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * - geometric series 3808cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * 3818cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * takes the geometric series approximation to the division 3828cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * 
 *     t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *   in this case just the first two terms to fit in 16bit arithmetic
 *
 *     t/255 ~= (t + (t >> 8)) >> 8
 *
 *   note that just by itself it doesn't satisfy the OpenGL criteria, as
 *   255*255 = 254, so the special case b = 255 must be accounted for, or
 *   roundoff must be used
 *
 * - geometric series plus rounding
 *
 *   when using a geometric series division instead of truncating the result
 *   use roundoff in the approximation (Jim Blinn)
 *
 *     t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *   achieving the exact results
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
4048cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * @sa Michael Herf, The "double blend trick", May 2000, 4058cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * http://www.stereopsis.com/doubleblend.html 4068cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */ 4078cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comstatic LLVMValueRef 4088cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_mul_u8n(struct gallivm_state *gallivm, 4098cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com struct lp_type i16_type, 4108cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef a, LLVMValueRef b) 4118cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{ 4128cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMBuilderRef builder = gallivm->builder; 4138cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef c8; 4148cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef ab; 4158cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4168cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(!i16_type.floating); 4178cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(i16_type, a)); 4188cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(i16_type, b)); 4198cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4208cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com c8 = lp_build_const_int_vec(gallivm, i16_type, 8); 4218cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4228cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#if 0 4238cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4248cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com /* a*b/255 ~= (a*(b + 1)) >> 256 */ 4258cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallium, i16_type, 1), ""); 
4268cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com ab = LLVMBuildMul(builder, a, b, ""); 4278cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4288cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#else 4298cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4308cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */ 4318cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com ab = LLVMBuildMul(builder, a, b, ""); 4328cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), ""); 4338cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), ""); 4348cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4358cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#endif 4368cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4378cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com ab = LLVMBuildLShr(builder, ab, c8, ""); 4388cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4398cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return ab; 4408cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com} 4418cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4428cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4438cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/** 4448cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate a * b 4458cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */ 4468cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef 4478cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_mul(struct lp_build_context *bld, 4488cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef a, 4498cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef b) 
4508cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{ 4518cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMBuilderRef builder = bld->gallivm->builder; 4528cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com const struct lp_type type = bld->type; 4538cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef shift; 4548cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef res; 4558cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4568cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(type, a)); 4578cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(lp_check_value(type, b)); 4588cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4598cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(a == bld->zero) 4608cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return bld->zero; 4618cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(a == bld->one) 4628cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return b; 4638cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(b == bld->zero) 4648cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return bld->zero; 4658cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(b == bld->one) 4668cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return a; 4678cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(a == bld->undef || b == bld->undef) 4688cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return bld->undef; 4698cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4708cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(!type.floating && !type.fixed && type.norm) { 4718cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.width == 8) { 4728cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com struct lp_type i16_type = lp_wider_type(type); 4738cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com LLVMValueRef 
al, ah, bl, bh, abl, abh, ab; 4748cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4758cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah); 4768cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh); 4778cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4788cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com /* PMULLW, PSRLW, PADDW */ 4798cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl); 4808cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh); 4818cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4828cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh); 4838cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4848cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return ab; 4858cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 4868cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4878cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com /* FIXME */ 4888cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com assert(0); 4898cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 4908cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4918cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.fixed) 4928cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2); 4938cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 4948cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com shift = NULL; 4958cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 4968cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 
4978cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if (type.floating) 4988cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMConstFMul(a, b); 4998cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 5008cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMConstMul(a, b); 5018cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(shift) { 5028cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.sign) 5038cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMConstAShr(res, shift); 5048cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 5058cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMConstLShr(res, shift); 5068cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 5078cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 5088cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else { 5098cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if (type.floating) 5108cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMBuildFMul(builder, a, b, ""); 5118cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 5128cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMBuildMul(builder, a, b, ""); 5138cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(shift) { 5148cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com if(type.sign) 5158cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMBuildAShr(builder, res, shift, ""); 5168cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com else 5178cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com res = LLVMBuildLShr(builder, res, shift, ""); 5188cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 5198cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com } 5208cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 5218cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com return res; 
5228cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com} 5238cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 5248cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com 5258cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/** 5268cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Small vector x scale multiplication optimization. 5278cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */ 5288cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef 529cf2cfa174ca878c144e17e9fc60ca8e9070d7dededisonn@google.comlp_build_mul_imm(struct lp_build_context *bld, 530 LLVMValueRef a, 531 int b) 532{ 533 LLVMBuilderRef builder = bld->gallivm->builder; 534 LLVMValueRef factor; 535 536 assert(lp_check_value(bld->type, a)); 537 538 if(b == 0) 539 return bld->zero; 540 541 if(b == 1) 542 return a; 543 544 if(b == -1) 545 return lp_build_negate(bld, a); 546 547 if(b == 2 && bld->type.floating) 548 return lp_build_add(bld, a, a); 549 550 if(util_is_power_of_two(b)) { 551 unsigned shift = ffs(b) - 1; 552 553 if(bld->type.floating) { 554#if 0 555 /* 556 * Power of two multiplication by directly manipulating the mantissa. 557 * 558 * XXX: This might not be always faster, it will introduce a small error 559 * for multiplication by zero, and it will produce wrong results 560 * for Inf and NaN. 
561 */ 562 unsigned mantissa = lp_mantissa(bld->type); 563 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa); 564 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), ""); 565 a = LLVMBuildAdd(builder, a, factor, ""); 566 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), ""); 567 return a; 568#endif 569 } 570 else { 571 factor = lp_build_const_vec(bld->gallivm, bld->type, shift); 572 return LLVMBuildShl(builder, a, factor, ""); 573 } 574 } 575 576 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b); 577 return lp_build_mul(bld, a, factor); 578} 579 580 581/** 582 * Generate a / b 583 */ 584LLVMValueRef 585lp_build_div(struct lp_build_context *bld, 586 LLVMValueRef a, 587 LLVMValueRef b) 588{ 589 LLVMBuilderRef builder = bld->gallivm->builder; 590 const struct lp_type type = bld->type; 591 592 assert(lp_check_value(type, a)); 593 assert(lp_check_value(type, b)); 594 595 if(a == bld->zero) 596 return bld->zero; 597 if(a == bld->one) 598 return lp_build_rcp(bld, b); 599 if(b == bld->zero) 600 return bld->undef; 601 if(b == bld->one) 602 return a; 603 if(a == bld->undef || b == bld->undef) 604 return bld->undef; 605 606 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 607 if (type.floating) 608 return LLVMConstFDiv(a, b); 609 else if (type.sign) 610 return LLVMConstSDiv(a, b); 611 else 612 return LLVMConstUDiv(a, b); 613 } 614 615 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) 616 return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 617 618 if (type.floating) 619 return LLVMBuildFDiv(builder, a, b, ""); 620 else if (type.sign) 621 return LLVMBuildSDiv(builder, a, b, ""); 622 else 623 return LLVMBuildUDiv(builder, a, b, ""); 624} 625 626 627/** 628 * Linear interpolation -- without any checks. 
629 * 630 * @sa http://www.stereopsis.com/doubleblend.html 631 */ 632static INLINE LLVMValueRef 633lp_build_lerp_simple(struct lp_build_context *bld, 634 LLVMValueRef x, 635 LLVMValueRef v0, 636 LLVMValueRef v1) 637{ 638 LLVMBuilderRef builder = bld->gallivm->builder; 639 LLVMValueRef delta; 640 LLVMValueRef res; 641 642 assert(lp_check_value(bld->type, x)); 643 assert(lp_check_value(bld->type, v0)); 644 assert(lp_check_value(bld->type, v1)); 645 646 delta = lp_build_sub(bld, v1, v0); 647 648 res = lp_build_mul(bld, x, delta); 649 650 res = lp_build_add(bld, v0, res); 651 652 if (bld->type.fixed) { 653 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits, 654 * but it will be wrong for other uses. Basically we need a more 655 * powerful lp_type, capable of further distinguishing the values 656 * interpretation from the value storage. */ 657 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), ""); 658 } 659 660 return res; 661} 662 663 664/** 665 * Linear interpolation. 666 */ 667LLVMValueRef 668lp_build_lerp(struct lp_build_context *bld, 669 LLVMValueRef x, 670 LLVMValueRef v0, 671 LLVMValueRef v1) 672{ 673 LLVMBuilderRef builder = bld->gallivm->builder; 674 const struct lp_type type = bld->type; 675 LLVMValueRef res; 676 677 assert(lp_check_value(type, x)); 678 assert(lp_check_value(type, v0)); 679 assert(lp_check_value(type, v1)); 680 681 if (type.norm) { 682 struct lp_type wide_type; 683 struct lp_build_context wide_bld; 684 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; 685 LLVMValueRef shift; 686 687 assert(type.length >= 2); 688 assert(!type.sign); 689 690 /* 691 * Create a wider type, enough to hold the intermediate result of the 692 * multiplication. 
693 */ 694 memset(&wide_type, 0, sizeof wide_type); 695 wide_type.fixed = TRUE; 696 wide_type.width = type.width*2; 697 wide_type.length = type.length/2; 698 699 lp_build_context_init(&wide_bld, bld->gallivm, wide_type); 700 701 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh); 702 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h); 703 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h); 704 705 /* 706 * Scale x from [0, 255] to [0, 256] 707 */ 708 709 shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1); 710 711 xl = lp_build_add(&wide_bld, xl, 712 LLVMBuildAShr(builder, xl, shift, "")); 713 xh = lp_build_add(&wide_bld, xh, 714 LLVMBuildAShr(builder, xh, shift, "")); 715 716 /* 717 * Lerp both halves. 718 */ 719 720 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l); 721 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h); 722 723 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh); 724 } else { 725 res = lp_build_lerp_simple(bld, x, v0, v1); 726 } 727 728 return res; 729} 730 731 732LLVMValueRef 733lp_build_lerp_2d(struct lp_build_context *bld, 734 LLVMValueRef x, 735 LLVMValueRef y, 736 LLVMValueRef v00, 737 LLVMValueRef v01, 738 LLVMValueRef v10, 739 LLVMValueRef v11) 740{ 741 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01); 742 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11); 743 return lp_build_lerp(bld, y, v0, v1); 744} 745 746 747/** 748 * Generate min(a, b) 749 * Do checks for special cases. 
750 */ 751LLVMValueRef 752lp_build_min(struct lp_build_context *bld, 753 LLVMValueRef a, 754 LLVMValueRef b) 755{ 756 assert(lp_check_value(bld->type, a)); 757 assert(lp_check_value(bld->type, b)); 758 759 if(a == bld->undef || b == bld->undef) 760 return bld->undef; 761 762 if(a == b) 763 return a; 764 765 if(bld->type.norm) { 766 if(a == bld->zero || b == bld->zero) 767 return bld->zero; 768 if(a == bld->one) 769 return b; 770 if(b == bld->one) 771 return a; 772 } 773 774 return lp_build_min_simple(bld, a, b); 775} 776 777 778/** 779 * Generate max(a, b) 780 * Do checks for special cases. 781 */ 782LLVMValueRef 783lp_build_max(struct lp_build_context *bld, 784 LLVMValueRef a, 785 LLVMValueRef b) 786{ 787 assert(lp_check_value(bld->type, a)); 788 assert(lp_check_value(bld->type, b)); 789 790 if(a == bld->undef || b == bld->undef) 791 return bld->undef; 792 793 if(a == b) 794 return a; 795 796 if(bld->type.norm) { 797 if(a == bld->one || b == bld->one) 798 return bld->one; 799 if(a == bld->zero) 800 return b; 801 if(b == bld->zero) 802 return a; 803 } 804 805 return lp_build_max_simple(bld, a, b); 806} 807 808 809/** 810 * Generate clamp(a, min, max) 811 * Do checks for special cases. 
812 */ 813LLVMValueRef 814lp_build_clamp(struct lp_build_context *bld, 815 LLVMValueRef a, 816 LLVMValueRef min, 817 LLVMValueRef max) 818{ 819 assert(lp_check_value(bld->type, a)); 820 assert(lp_check_value(bld->type, min)); 821 assert(lp_check_value(bld->type, max)); 822 823 a = lp_build_min(bld, a, max); 824 a = lp_build_max(bld, a, min); 825 return a; 826} 827 828 829/** 830 * Generate abs(a) 831 */ 832LLVMValueRef 833lp_build_abs(struct lp_build_context *bld, 834 LLVMValueRef a) 835{ 836 LLVMBuilderRef builder = bld->gallivm->builder; 837 const struct lp_type type = bld->type; 838 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 839 840 assert(lp_check_value(type, a)); 841 842 if(!type.sign) 843 return a; 844 845 if(type.floating) { 846 /* Mask out the sign bit */ 847 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 848 unsigned long long absMask = ~(1ULL << (type.width - 1)); 849 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask)); 850 a = LLVMBuildBitCast(builder, a, int_vec_type, ""); 851 a = LLVMBuildAnd(builder, a, mask, ""); 852 a = LLVMBuildBitCast(builder, a, vec_type, ""); 853 return a; 854 } 855 856 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) { 857 switch(type.width) { 858 case 8: 859 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 860 case 16: 861 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 862 case 32: 863 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 864 } 865 } 866 867 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, "")); 868} 869 870 871LLVMValueRef 872lp_build_negate(struct lp_build_context *bld, 873 LLVMValueRef a) 874{ 875 LLVMBuilderRef builder = bld->gallivm->builder; 876 877 assert(lp_check_value(bld->type, a)); 878 879#if HAVE_LLVM >= 0x0207 880 if (bld->type.floating) 881 a = LLVMBuildFNeg(builder, a, ""); 882 else 883#endif 884 
a = LLVMBuildNeg(builder, a, ""); 885 886 return a; 887} 888 889 890/** Return -1, 0 or +1 depending on the sign of a */ 891LLVMValueRef 892lp_build_sgn(struct lp_build_context *bld, 893 LLVMValueRef a) 894{ 895 LLVMBuilderRef builder = bld->gallivm->builder; 896 const struct lp_type type = bld->type; 897 LLVMValueRef cond; 898 LLVMValueRef res; 899 900 assert(lp_check_value(type, a)); 901 902 /* Handle non-zero case */ 903 if(!type.sign) { 904 /* if not zero then sign must be positive */ 905 res = bld->one; 906 } 907 else if(type.floating) { 908 LLVMTypeRef vec_type; 909 LLVMTypeRef int_type; 910 LLVMValueRef mask; 911 LLVMValueRef sign; 912 LLVMValueRef one; 913 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); 914 915 int_type = lp_build_int_vec_type(bld->gallivm, type); 916 vec_type = lp_build_vec_type(bld->gallivm, type); 917 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit); 918 919 /* Take the sign bit and add it to 1 constant */ 920 sign = LLVMBuildBitCast(builder, a, int_type, ""); 921 sign = LLVMBuildAnd(builder, sign, mask, ""); 922 one = LLVMConstBitCast(bld->one, int_type); 923 res = LLVMBuildOr(builder, sign, one, ""); 924 res = LLVMBuildBitCast(builder, res, vec_type, ""); 925 } 926 else 927 { 928 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); 929 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); 930 res = lp_build_select(bld, cond, bld->one, minus_one); 931 } 932 933 /* Handle zero */ 934 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero); 935 res = lp_build_select(bld, cond, bld->zero, res); 936 937 return res; 938} 939 940 941/** 942 * Set the sign of float vector 'a' according to 'sign'. 943 * If sign==0, return abs(a). 944 * If sign==1, return -abs(a); 945 * Other values for sign produce undefined results. 
946 */ 947LLVMValueRef 948lp_build_set_sign(struct lp_build_context *bld, 949 LLVMValueRef a, LLVMValueRef sign) 950{ 951 LLVMBuilderRef builder = bld->gallivm->builder; 952 const struct lp_type type = bld->type; 953 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 954 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 955 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1); 956 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 957 ~((unsigned long long) 1 << (type.width - 1))); 958 LLVMValueRef val, res; 959 960 assert(type.floating); 961 assert(lp_check_value(type, a)); 962 963 /* val = reinterpret_cast<int>(a) */ 964 val = LLVMBuildBitCast(builder, a, int_vec_type, ""); 965 /* val = val & mask */ 966 val = LLVMBuildAnd(builder, val, mask, ""); 967 /* sign = sign << shift */ 968 sign = LLVMBuildShl(builder, sign, shift, ""); 969 /* res = val | sign */ 970 res = LLVMBuildOr(builder, val, sign, ""); 971 /* res = reinterpret_cast<float>(res) */ 972 res = LLVMBuildBitCast(builder, res, vec_type, ""); 973 974 return res; 975} 976 977 978/** 979 * Convert vector of (or scalar) int to vector of (or scalar) float. 980 */ 981LLVMValueRef 982lp_build_int_to_float(struct lp_build_context *bld, 983 LLVMValueRef a) 984{ 985 LLVMBuilderRef builder = bld->gallivm->builder; 986 const struct lp_type type = bld->type; 987 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 988 989 assert(type.floating); 990 991 return LLVMBuildSIToFP(builder, a, vec_type, ""); 992} 993 994 995 996enum lp_build_round_sse41_mode 997{ 998 LP_BUILD_ROUND_SSE41_NEAREST = 0, 999 LP_BUILD_ROUND_SSE41_FLOOR = 1, 1000 LP_BUILD_ROUND_SSE41_CEIL = 2, 1001 LP_BUILD_ROUND_SSE41_TRUNCATE = 3 1002}; 1003 1004 1005/** 1006 * Helper for SSE4.1's ROUNDxx instructions. 1007 * 1008 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the 1009 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0. 
1010 */ 1011static INLINE LLVMValueRef 1012lp_build_round_sse41(struct lp_build_context *bld, 1013 LLVMValueRef a, 1014 enum lp_build_round_sse41_mode mode) 1015{ 1016 LLVMBuilderRef builder = bld->gallivm->builder; 1017 const struct lp_type type = bld->type; 1018 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1019 const char *intrinsic; 1020 LLVMValueRef res; 1021 1022 assert(type.floating); 1023 1024 assert(lp_check_value(type, a)); 1025 assert(util_cpu_caps.has_sse4_1); 1026 1027 if (type.length == 1) { 1028 LLVMTypeRef vec_type; 1029 LLVMValueRef undef; 1030 LLVMValueRef args[3]; 1031 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1032 1033 switch(type.width) { 1034 case 32: 1035 intrinsic = "llvm.x86.sse41.round.ss"; 1036 break; 1037 case 64: 1038 intrinsic = "llvm.x86.sse41.round.sd"; 1039 break; 1040 default: 1041 assert(0); 1042 return bld->undef; 1043 } 1044 1045 vec_type = LLVMVectorType(bld->elem_type, 4); 1046 1047 undef = LLVMGetUndef(vec_type); 1048 1049 args[0] = undef; 1050 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1051 args[2] = LLVMConstInt(i32t, mode, 0); 1052 1053 res = lp_build_intrinsic(builder, intrinsic, 1054 vec_type, args, Elements(args)); 1055 1056 res = LLVMBuildExtractElement(builder, res, index0, ""); 1057 } 1058 else { 1059 assert(type.width*type.length == 128); 1060 1061 switch(type.width) { 1062 case 32: 1063 intrinsic = "llvm.x86.sse41.round.ps"; 1064 break; 1065 case 64: 1066 intrinsic = "llvm.x86.sse41.round.pd"; 1067 break; 1068 default: 1069 assert(0); 1070 return bld->undef; 1071 } 1072 1073 res = lp_build_intrinsic_binary(builder, intrinsic, 1074 bld->vec_type, a, 1075 LLVMConstInt(i32t, mode, 0)); 1076 } 1077 1078 return res; 1079} 1080 1081 1082static INLINE LLVMValueRef 1083lp_build_iround_nearest_sse2(struct lp_build_context *bld, 1084 LLVMValueRef a) 1085{ 1086 LLVMBuilderRef builder = bld->gallivm->builder; 1087 const struct lp_type type = bld->type; 1088 LLVMTypeRef i32t = 
LLVMInt32TypeInContext(bld->gallivm->context); 1089 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type); 1090 const char *intrinsic; 1091 LLVMValueRef res; 1092 1093 assert(type.floating); 1094 /* using the double precision conversions is a bit more complicated */ 1095 assert(type.width == 32); 1096 1097 assert(lp_check_value(type, a)); 1098 assert(util_cpu_caps.has_sse2); 1099 1100 /* This is relying on MXCSR rounding mode, which should always be nearest. */ 1101 if (type.length == 1) { 1102 LLVMTypeRef vec_type; 1103 LLVMValueRef undef; 1104 LLVMValueRef arg; 1105 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1106 1107 vec_type = LLVMVectorType(bld->elem_type, 4); 1108 1109 intrinsic = "llvm.x86.sse.cvtss2si"; 1110 1111 undef = LLVMGetUndef(vec_type); 1112 1113 arg = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1114 1115 res = lp_build_intrinsic_unary(builder, intrinsic, 1116 ret_type, arg); 1117 } 1118 else { 1119 assert(type.width*type.length == 128); 1120 1121 intrinsic = "llvm.x86.sse2.cvtps2dq"; 1122 1123 res = lp_build_intrinsic_unary(builder, intrinsic, 1124 ret_type, a); 1125 } 1126 1127 return res; 1128} 1129 1130 1131/** 1132 * Return the integer part of a float (vector) value (== round toward zero). 1133 * The returned value is a float (vector). 
1134 * Ex: trunc(-1.5) = -1.0 1135 */ 1136LLVMValueRef 1137lp_build_trunc(struct lp_build_context *bld, 1138 LLVMValueRef a) 1139{ 1140 LLVMBuilderRef builder = bld->gallivm->builder; 1141 const struct lp_type type = bld->type; 1142 1143 assert(type.floating); 1144 assert(lp_check_value(type, a)); 1145 1146 if (util_cpu_caps.has_sse4_1 && 1147 (type.length == 1 || type.width*type.length == 128)) { 1148 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); 1149 } 1150 else { 1151 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1152 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1153 LLVMValueRef res; 1154 res = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 1155 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1156 return res; 1157 } 1158} 1159 1160 1161/** 1162 * Return float (vector) rounded to nearest integer (vector). The returned 1163 * value is a float (vector). 1164 * Ex: round(0.9) = 1.0 1165 * Ex: round(-1.5) = -2.0 1166 */ 1167LLVMValueRef 1168lp_build_round(struct lp_build_context *bld, 1169 LLVMValueRef a) 1170{ 1171 LLVMBuilderRef builder = bld->gallivm->builder; 1172 const struct lp_type type = bld->type; 1173 1174 assert(type.floating); 1175 assert(lp_check_value(type, a)); 1176 1177 if (util_cpu_caps.has_sse4_1 && 1178 (type.length == 1 || type.width*type.length == 128)) { 1179 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 1180 } 1181 else { 1182 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1183 LLVMValueRef res; 1184 res = lp_build_iround(bld, a); 1185 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1186 return res; 1187 } 1188} 1189 1190 1191/** 1192 * Return floor of float (vector), result is a float (vector) 1193 * Ex: floor(1.1) = 1.0 1194 * Ex: floor(-1.1) = -2.0 1195 */ 1196LLVMValueRef 1197lp_build_floor(struct lp_build_context *bld, 1198 LLVMValueRef a) 1199{ 1200 LLVMBuilderRef builder = bld->gallivm->builder; 1201 const struct lp_type type = 
bld->type; 1202 1203 assert(type.floating); 1204 assert(lp_check_value(type, a)); 1205 1206 if (util_cpu_caps.has_sse4_1 && 1207 (type.length == 1 || type.width*type.length == 128)) { 1208 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 1209 } 1210 else { 1211 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1212 LLVMValueRef res; 1213 res = lp_build_ifloor(bld, a); 1214 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1215 return res; 1216 } 1217} 1218 1219 1220/** 1221 * Return ceiling of float (vector), returning float (vector). 1222 * Ex: ceil( 1.1) = 2.0 1223 * Ex: ceil(-1.1) = -1.0 1224 */ 1225LLVMValueRef 1226lp_build_ceil(struct lp_build_context *bld, 1227 LLVMValueRef a) 1228{ 1229 LLVMBuilderRef builder = bld->gallivm->builder; 1230 const struct lp_type type = bld->type; 1231 1232 assert(type.floating); 1233 assert(lp_check_value(type, a)); 1234 1235 if (util_cpu_caps.has_sse4_1 && 1236 (type.length == 1 || type.width*type.length == 128)) { 1237 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 1238 } 1239 else { 1240 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1241 LLVMValueRef res; 1242 res = lp_build_iceil(bld, a); 1243 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1244 return res; 1245 } 1246} 1247 1248 1249/** 1250 * Return fractional part of 'a' computed as a - floor(a) 1251 * Typically used in texture coord arithmetic. 1252 */ 1253LLVMValueRef 1254lp_build_fract(struct lp_build_context *bld, 1255 LLVMValueRef a) 1256{ 1257 assert(bld->type.floating); 1258 return lp_build_sub(bld, a, lp_build_floor(bld, a)); 1259} 1260 1261 1262/** 1263 * Return the integer part of a float (vector) value (== round toward zero). 1264 * The returned value is an integer (vector). 
1265 * Ex: itrunc(-1.5) = -1 1266 */ 1267LLVMValueRef 1268lp_build_itrunc(struct lp_build_context *bld, 1269 LLVMValueRef a) 1270{ 1271 LLVMBuilderRef builder = bld->gallivm->builder; 1272 const struct lp_type type = bld->type; 1273 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1274 1275 assert(type.floating); 1276 assert(lp_check_value(type, a)); 1277 1278 return LLVMBuildFPToSI(builder, a, int_vec_type, ""); 1279} 1280 1281 1282/** 1283 * Return float (vector) rounded to nearest integer (vector). The returned 1284 * value is an integer (vector). 1285 * Ex: iround(0.9) = 1 1286 * Ex: iround(-1.5) = -2 1287 */ 1288LLVMValueRef 1289lp_build_iround(struct lp_build_context *bld, 1290 LLVMValueRef a) 1291{ 1292 LLVMBuilderRef builder = bld->gallivm->builder; 1293 const struct lp_type type = bld->type; 1294 LLVMTypeRef int_vec_type = bld->int_vec_type; 1295 LLVMValueRef res; 1296 1297 assert(type.floating); 1298 1299 assert(lp_check_value(type, a)); 1300 1301 if (util_cpu_caps.has_sse2 && 1302 ((type.width == 32) && (type.length == 1 || type.length == 4))) { 1303 return lp_build_iround_nearest_sse2(bld, a); 1304 } 1305 else if (util_cpu_caps.has_sse4_1 && 1306 (type.length == 1 || type.width*type.length == 128)) { 1307 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 1308 } 1309 else { 1310 LLVMValueRef half; 1311 1312 half = lp_build_const_vec(bld->gallivm, type, 0.5); 1313 1314 if (type.sign) { 1315 LLVMTypeRef vec_type = bld->vec_type; 1316 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1317 (unsigned long long)1 << (type.width - 1)); 1318 LLVMValueRef sign; 1319 1320 /* get sign bit */ 1321 sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1322 sign = LLVMBuildAnd(builder, sign, mask, ""); 1323 1324 /* sign * 0.5 */ 1325 half = LLVMBuildBitCast(builder, half, int_vec_type, ""); 1326 half = LLVMBuildOr(builder, sign, half, ""); 1327 half = LLVMBuildBitCast(builder, half, vec_type, ""); 1328 } 1329 1330 res 
= LLVMBuildFAdd(builder, a, half, ""); 1331 } 1332 1333 res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); 1334 1335 return res; 1336} 1337 1338 1339/** 1340 * Return floor of float (vector), result is an int (vector) 1341 * Ex: ifloor(1.1) = 1.0 1342 * Ex: ifloor(-1.1) = -2.0 1343 */ 1344LLVMValueRef 1345lp_build_ifloor(struct lp_build_context *bld, 1346 LLVMValueRef a) 1347{ 1348 LLVMBuilderRef builder = bld->gallivm->builder; 1349 const struct lp_type type = bld->type; 1350 LLVMTypeRef int_vec_type = bld->int_vec_type; 1351 LLVMValueRef res; 1352 1353 assert(type.floating); 1354 assert(lp_check_value(type, a)); 1355 1356 if (util_cpu_caps.has_sse4_1 && 1357 (type.length == 1 || type.width*type.length == 128)) { 1358 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 1359 } 1360 else { 1361 res = a; 1362 1363 if (type.sign) { 1364 /* Take the sign bit and add it to 1 constant */ 1365 LLVMTypeRef vec_type = bld->vec_type; 1366 unsigned mantissa = lp_mantissa(type); 1367 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1368 (unsigned long long)1 << (type.width - 1)); 1369 LLVMValueRef sign; 1370 LLVMValueRef offset; 1371 1372 /* sign = a < 0 ? ~0 : 0 */ 1373 sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1374 sign = LLVMBuildAnd(builder, sign, mask, ""); 1375 sign = LLVMBuildAShr(builder, sign, 1376 lp_build_const_int_vec(bld->gallivm, type, 1377 type.width - 1), 1378 "ifloor.sign"); 1379 1380 /* offset = -0.99999(9)f */ 1381 offset = lp_build_const_vec(bld->gallivm, type, 1382 -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); 1383 offset = LLVMConstBitCast(offset, int_vec_type); 1384 1385 /* offset = a < 0 ? 
offset : 0.0f */ 1386 offset = LLVMBuildAnd(builder, offset, sign, ""); 1387 offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset"); 1388 1389 res = LLVMBuildFAdd(builder, res, offset, "ifloor.res"); 1390 } 1391 } 1392 1393 /* round to nearest (toward zero) */ 1394 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res"); 1395 1396 return res; 1397} 1398 1399 1400/** 1401 * Return ceiling of float (vector), returning int (vector). 1402 * Ex: iceil( 1.1) = 2 1403 * Ex: iceil(-1.1) = -1 1404 */ 1405LLVMValueRef 1406lp_build_iceil(struct lp_build_context *bld, 1407 LLVMValueRef a) 1408{ 1409 LLVMBuilderRef builder = bld->gallivm->builder; 1410 const struct lp_type type = bld->type; 1411 LLVMTypeRef int_vec_type = bld->int_vec_type; 1412 LLVMValueRef res; 1413 1414 assert(type.floating); 1415 assert(lp_check_value(type, a)); 1416 1417 if (util_cpu_caps.has_sse4_1 && 1418 (type.length == 1 || type.width*type.length == 128)) { 1419 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 1420 } 1421 else { 1422 LLVMTypeRef vec_type = bld->vec_type; 1423 unsigned mantissa = lp_mantissa(type); 1424 LLVMValueRef offset; 1425 1426 /* offset = 0.99999(9)f */ 1427 offset = lp_build_const_vec(bld->gallivm, type, 1428 (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); 1429 1430 if (type.sign) { 1431 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1432 (unsigned long long)1 << (type.width - 1)); 1433 LLVMValueRef sign; 1434 1435 /* sign = a < 0 ? 0 : ~0 */ 1436 sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1437 sign = LLVMBuildAnd(builder, sign, mask, ""); 1438 sign = LLVMBuildAShr(builder, sign, 1439 lp_build_const_int_vec(bld->gallivm, type, 1440 type.width - 1), 1441 "iceil.sign"); 1442 sign = LLVMBuildNot(builder, sign, "iceil.not"); 1443 1444 /* offset = a < 0 ? 
0.0 : offset */ 1445 offset = LLVMConstBitCast(offset, int_vec_type); 1446 offset = LLVMBuildAnd(builder, offset, sign, ""); 1447 offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset"); 1448 } 1449 1450 res = LLVMBuildFAdd(builder, a, offset, "iceil.res"); 1451 } 1452 1453 /* round to nearest (toward zero) */ 1454 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res"); 1455 1456 return res; 1457} 1458 1459 1460/** 1461 * Combined ifloor() & fract(). 1462 * 1463 * Preferred to calling the functions separately, as it will ensure that the 1464 * stratergy (floor() vs ifloor()) that results in less redundant work is used. 1465 */ 1466void 1467lp_build_ifloor_fract(struct lp_build_context *bld, 1468 LLVMValueRef a, 1469 LLVMValueRef *out_ipart, 1470 LLVMValueRef *out_fpart) 1471{ 1472 LLVMBuilderRef builder = bld->gallivm->builder; 1473 const struct lp_type type = bld->type; 1474 LLVMValueRef ipart; 1475 1476 assert(type.floating); 1477 assert(lp_check_value(type, a)); 1478 1479 if (util_cpu_caps.has_sse4_1 && 1480 (type.length == 1 || type.width*type.length == 128)) { 1481 /* 1482 * floor() is easier. 1483 */ 1484 1485 ipart = lp_build_floor(bld, a); 1486 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 1487 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart"); 1488 } 1489 else { 1490 /* 1491 * ifloor() is easier. 
1492 */ 1493 1494 *out_ipart = lp_build_ifloor(bld, a); 1495 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart"); 1496 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 1497 } 1498} 1499 1500 1501LLVMValueRef 1502lp_build_sqrt(struct lp_build_context *bld, 1503 LLVMValueRef a) 1504{ 1505 LLVMBuilderRef builder = bld->gallivm->builder; 1506 const struct lp_type type = bld->type; 1507 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1508 char intrinsic[32]; 1509 1510 assert(lp_check_value(type, a)); 1511 1512 /* TODO: optimize the constant case */ 1513 /* TODO: optimize the constant case */ 1514 1515 assert(type.floating); 1516 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width); 1517 1518 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 1519} 1520 1521 1522/** 1523 * Do one Newton-Raphson step to improve reciprocate precision: 1524 * 1525 * x_{i+1} = x_i * (2 - a * x_i) 1526 * 1527 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or 1528 * +/-Inf, giving NaN instead. Certain applications rely on this behavior, 1529 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's 1530 * halo. It would be necessary to clamp the argument to prevent this. 
1531 * 1532 * See also: 1533 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division 1534 * - http://softwarecommunity.intel.com/articles/eng/1818.htm 1535 */ 1536static INLINE LLVMValueRef 1537lp_build_rcp_refine(struct lp_build_context *bld, 1538 LLVMValueRef a, 1539 LLVMValueRef rcp_a) 1540{ 1541 LLVMBuilderRef builder = bld->gallivm->builder; 1542 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0); 1543 LLVMValueRef res; 1544 1545 res = LLVMBuildFMul(builder, a, rcp_a, ""); 1546 res = LLVMBuildFSub(builder, two, res, ""); 1547 res = LLVMBuildFMul(builder, rcp_a, res, ""); 1548 1549 return res; 1550} 1551 1552 1553LLVMValueRef 1554lp_build_rcp(struct lp_build_context *bld, 1555 LLVMValueRef a) 1556{ 1557 LLVMBuilderRef builder = bld->gallivm->builder; 1558 const struct lp_type type = bld->type; 1559 1560 assert(lp_check_value(type, a)); 1561 1562 if(a == bld->zero) 1563 return bld->undef; 1564 if(a == bld->one) 1565 return bld->one; 1566 if(a == bld->undef) 1567 return bld->undef; 1568 1569 assert(type.floating); 1570 1571 if(LLVMIsConstant(a)) 1572 return LLVMConstFDiv(bld->one, a); 1573 1574 /* 1575 * We don't use RCPPS because: 1576 * - it only has 10bits of precision 1577 * - it doesn't even get the reciprocate of 1.0 exactly 1578 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf 1579 * - for recent processors the benefit over DIVPS is marginal, a case 1580 * depedent 1581 * 1582 * We could still use it on certain processors if benchmarks show that the 1583 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for 1584 * particular uses that require less workarounds. 
1585 */ 1586 1587 if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { 1588 const unsigned num_iterations = 0; 1589 LLVMValueRef res; 1590 unsigned i; 1591 1592 res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a); 1593 1594 for (i = 0; i < num_iterations; ++i) { 1595 res = lp_build_rcp_refine(bld, a, res); 1596 } 1597 1598 return res; 1599 } 1600 1601 return LLVMBuildFDiv(builder, bld->one, a, ""); 1602} 1603 1604 1605/** 1606 * Do one Newton-Raphson step to improve rsqrt precision: 1607 * 1608 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i) 1609 * 1610 * See also: 1611 * - http://softwarecommunity.intel.com/articles/eng/1818.htm 1612 */ 1613static INLINE LLVMValueRef 1614lp_build_rsqrt_refine(struct lp_build_context *bld, 1615 LLVMValueRef a, 1616 LLVMValueRef rsqrt_a) 1617{ 1618 LLVMBuilderRef builder = bld->gallivm->builder; 1619 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5); 1620 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0); 1621 LLVMValueRef res; 1622 1623 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, ""); 1624 res = LLVMBuildFMul(builder, a, res, ""); 1625 res = LLVMBuildFSub(builder, three, res, ""); 1626 res = LLVMBuildFMul(builder, rsqrt_a, res, ""); 1627 res = LLVMBuildFMul(builder, half, res, ""); 1628 1629 return res; 1630} 1631 1632 1633/** 1634 * Generate 1/sqrt(a) 1635 */ 1636LLVMValueRef 1637lp_build_rsqrt(struct lp_build_context *bld, 1638 LLVMValueRef a) 1639{ 1640 LLVMBuilderRef builder = bld->gallivm->builder; 1641 const struct lp_type type = bld->type; 1642 1643 assert(lp_check_value(type, a)); 1644 1645 assert(type.floating); 1646 1647 if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { 1648 const unsigned num_iterations = 1; 1649 LLVMValueRef res; 1650 unsigned i; 1651 1652 res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a); 1653 1654 for (i = 0; i < num_iterations; ++i) { 1655 res = 
lp_build_rsqrt_refine(bld, a, res); 1656 } 1657 1658 return res; 1659 } 1660 1661 return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 1662} 1663 1664 1665/** 1666 * Generate sin(a) using SSE2 1667 */ 1668LLVMValueRef 1669lp_build_sin(struct lp_build_context *bld, 1670 LLVMValueRef a) 1671{ 1672 struct gallivm_state *gallivm = bld->gallivm; 1673 LLVMBuilderRef builder = gallivm->builder; 1674 struct lp_type int_type = lp_int_type(bld->type); 1675 LLVMBuilderRef b = builder; 1676 1677 /* 1678 * take the absolute value, 1679 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 1680 */ 1681 1682 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000); 1683 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si"); 1684 1685 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 1686 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs"); 1687 1688 /* 1689 * extract the sign bit (upper one) 1690 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask); 1691 */ 1692 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000); 1693 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i"); 1694 1695 /* 1696 * scale by 4/Pi 1697 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 1698 */ 1699 1700 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516); 1701 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 1702 1703 /* 1704 * store the integer part of y in mm0 1705 * emm2 = _mm_cvttps_epi32(y); 1706 */ 1707 1708 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i"); 1709 1710 /* 1711 * j=(j+1) & (~1) (see the cephes sources) 1712 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 1713 */ 1714 1715 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1); 1716 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 1717 /* 1718 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 1719 */ 1720 
LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1); 1721 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 1722 1723 /* 1724 * y = _mm_cvtepi32_ps(emm2); 1725 */ 1726 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2"); 1727 1728 /* get the swap sign flag 1729 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4); 1730 */ 1731 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4); 1732 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and"); 1733 1734 /* 1735 * emm2 = _mm_slli_epi32(emm0, 29); 1736 */ 1737 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29); 1738 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit"); 1739 1740 /* 1741 * get the polynom selection mask 1742 * there is one polynom for 0 <= x <= Pi/4 1743 * and another one for Pi/4<x<=Pi/2 1744 * Both branches will be computed. 1745 * 1746 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 1747 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 1748 */ 1749 1750 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 1751 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3"); 1752 LLVMValueRef poly_mask = lp_build_compare(gallivm, 1753 int_type, PIPE_FUNC_EQUAL, 1754 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0)); 1755 /* 1756 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit); 1757 */ 1758 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit"); 1759 1760 /* 1761 * _PS_CONST(minus_cephes_DP1, -0.78515625); 1762 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 1763 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 1764 */ 1765 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625); 1766 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4); 1767 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8); 1768 1769 /* 1770 * The magic pass: 
"Extended precision modular arithmetic" 1771 * x = ((x - y * DP1) - y * DP2) - y * DP3; 1772 * xmm1 = _mm_mul_ps(y, xmm1); 1773 * xmm2 = _mm_mul_ps(y, xmm2); 1774 * xmm3 = _mm_mul_ps(y, xmm3); 1775 */ 1776 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); 1777 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); 1778 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); 1779 1780 /* 1781 * x = _mm_add_ps(x, xmm1); 1782 * x = _mm_add_ps(x, xmm2); 1783 * x = _mm_add_ps(x, xmm3); 1784 */ 1785 1786 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); 1787 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2"); 1788 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); 1789 1790 /* 1791 * Evaluate the first polynom (0 <= x <= Pi/4) 1792 * 1793 * z = _mm_mul_ps(x,x); 1794 */ 1795 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 1796 1797 /* 1798 * _PS_CONST(coscof_p0, 2.443315711809948E-005); 1799 * _PS_CONST(coscof_p1, -1.388731625493765E-003); 1800 * _PS_CONST(coscof_p2, 4.166664568298827E-002); 1801 */ 1802 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005); 1803 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003); 1804 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002); 1805 1806 /* 1807 * y = *(v4sf*)_ps_coscof_p0; 1808 * y = _mm_mul_ps(y, z); 1809 */ 1810 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); 1811 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4"); 1812 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); 1813 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); 1814 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 1815 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 1816 1817 1818 /* 1819 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 1820 * y = _mm_sub_ps(y, tmp); 1821 * y = _mm_add_ps(y, *(v4sf*)_ps_1); 1822 */ 1823 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5); 1824 LLVMValueRef tmp = 
LLVMBuildFMul(b, z, half, "tmp"); 1825 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 1826 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0); 1827 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 1828 1829 /* 1830 * _PS_CONST(sincof_p0, -1.9515295891E-4); 1831 * _PS_CONST(sincof_p1, 8.3321608736E-3); 1832 * _PS_CONST(sincof_p2, -1.6666654611E-1); 1833 */ 1834 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4); 1835 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3); 1836 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1); 1837 1838 /* 1839 * Evaluate the second polynom (Pi/4 <= x <= 0) 1840 * 1841 * y2 = *(v4sf*)_ps_sincof_p0; 1842 * y2 = _mm_mul_ps(y2, z); 1843 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 1844 * y2 = _mm_mul_ps(y2, z); 1845 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 1846 * y2 = _mm_mul_ps(y2, z); 1847 * y2 = _mm_mul_ps(y2, x); 1848 * y2 = _mm_add_ps(y2, x); 1849 */ 1850 1851 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); 1852 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); 1853 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); 1854 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); 1855 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 1856 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); 1857 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); 1858 1859 /* 1860 * select the correct result from the two polynoms 1861 * xmm3 = poly_mask; 1862 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 1863 * y = _mm_andnot_ps(xmm3, y); 1864 * y = _mm_add_ps(y,y2); 1865 */ 1866 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i"); 1867 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i"); 1868 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 1869 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0); 1870 LLVMValueRef 
poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); 1871 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 1872 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); 1873 1874 /* 1875 * update the sign 1876 * y = _mm_xor_ps(y, sign_bit); 1877 */ 1878 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin"); 1879 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result"); 1880 return y_result; 1881} 1882 1883 1884/** 1885 * Generate cos(a) using SSE2 1886 */ 1887LLVMValueRef 1888lp_build_cos(struct lp_build_context *bld, 1889 LLVMValueRef a) 1890{ 1891 struct gallivm_state *gallivm = bld->gallivm; 1892 LLVMBuilderRef builder = gallivm->builder; 1893 struct lp_type int_type = lp_int_type(bld->type); 1894 LLVMBuilderRef b = builder; 1895 1896 /* 1897 * take the absolute value, 1898 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 1899 */ 1900 1901 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000); 1902 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si"); 1903 1904 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 1905 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs"); 1906 1907 /* 1908 * scale by 4/Pi 1909 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 1910 */ 1911 1912 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516); 1913 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 1914 1915 /* 1916 * store the integer part of y in mm0 1917 * emm2 = _mm_cvttps_epi32(y); 1918 */ 1919 1920 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i"); 1921 1922 /* 1923 * j=(j+1) & (~1) (see the cephes sources) 1924 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 1925 */ 1926 1927 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1); 1928 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 1929 /* 1930 * emm2 = 
_mm_and_si128(emm2, *(v4si*)_pi32_inv1); 1931 */ 1932 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1); 1933 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 1934 1935 /* 1936 * y = _mm_cvtepi32_ps(emm2); 1937 */ 1938 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2"); 1939 1940 1941 /* 1942 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2); 1943 */ 1944 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 1945 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2"); 1946 1947 1948 /* get the swap sign flag 1949 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4); 1950 */ 1951 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0); 1952 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not"); 1953 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4); 1954 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and"); 1955 1956 /* 1957 * emm2 = _mm_slli_epi32(emm0, 29); 1958 */ 1959 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29); 1960 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit"); 1961 1962 /* 1963 * get the polynom selection mask 1964 * there is one polynom for 0 <= x <= Pi/4 1965 * and another one for Pi/4<x<=Pi/2 1966 * Both branches will be computed. 
1967 * 1968 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 1969 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 1970 */ 1971 1972 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 1973 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3"); 1974 LLVMValueRef poly_mask = lp_build_compare(gallivm, 1975 int_type, PIPE_FUNC_EQUAL, 1976 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0)); 1977 1978 /* 1979 * _PS_CONST(minus_cephes_DP1, -0.78515625); 1980 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 1981 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 1982 */ 1983 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625); 1984 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4); 1985 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8); 1986 1987 /* 1988 * The magic pass: "Extended precision modular arithmetic" 1989 * x = ((x - y * DP1) - y * DP2) - y * DP3; 1990 * xmm1 = _mm_mul_ps(y, xmm1); 1991 * xmm2 = _mm_mul_ps(y, xmm2); 1992 * xmm3 = _mm_mul_ps(y, xmm3); 1993 */ 1994 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); 1995 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); 1996 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); 1997 1998 /* 1999 * x = _mm_add_ps(x, xmm1); 2000 * x = _mm_add_ps(x, xmm2); 2001 * x = _mm_add_ps(x, xmm3); 2002 */ 2003 2004 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); 2005 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2"); 2006 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); 2007 2008 /* 2009 * Evaluate the first polynom (0 <= x <= Pi/4) 2010 * 2011 * z = _mm_mul_ps(x,x); 2012 */ 2013 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 2014 2015 /* 2016 * _PS_CONST(coscof_p0, 2.443315711809948E-005); 2017 * _PS_CONST(coscof_p1, -1.388731625493765E-003); 2018 * _PS_CONST(coscof_p2, 4.166664568298827E-002); 2019 */ 2020 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, 
bld->type, 2.443315711809948E-005); 2021 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003); 2022 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002); 2023 2024 /* 2025 * y = *(v4sf*)_ps_coscof_p0; 2026 * y = _mm_mul_ps(y, z); 2027 */ 2028 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); 2029 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4"); 2030 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); 2031 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); 2032 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 2033 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 2034 2035 2036 /* 2037 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 2038 * y = _mm_sub_ps(y, tmp); 2039 * y = _mm_add_ps(y, *(v4sf*)_ps_1); 2040 */ 2041 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5); 2042 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); 2043 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 2044 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0); 2045 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 2046 2047 /* 2048 * _PS_CONST(sincof_p0, -1.9515295891E-4); 2049 * _PS_CONST(sincof_p1, 8.3321608736E-3); 2050 * _PS_CONST(sincof_p2, -1.6666654611E-1); 2051 */ 2052 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4); 2053 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3); 2054 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1); 2055 2056 /* 2057 * Evaluate the second polynom (Pi/4 <= x <= 0) 2058 * 2059 * y2 = *(v4sf*)_ps_sincof_p0; 2060 * y2 = _mm_mul_ps(y2, z); 2061 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 2062 * y2 = _mm_mul_ps(y2, z); 2063 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 2064 * y2 = _mm_mul_ps(y2, z); 2065 * y2 = _mm_mul_ps(y2, x); 2066 * y2 = _mm_add_ps(y2, x); 2067 */ 2068 2069 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); 
2070 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); 2071 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); 2072 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); 2073 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 2074 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); 2075 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); 2076 2077 /* 2078 * select the correct result from the two polynoms 2079 * xmm3 = poly_mask; 2080 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 2081 * y = _mm_andnot_ps(xmm3, y); 2082 * y = _mm_add_ps(y,y2); 2083 */ 2084 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i"); 2085 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i"); 2086 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 2087 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); 2088 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 2089 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); 2090 2091 /* 2092 * update the sign 2093 * y = _mm_xor_ps(y, sign_bit); 2094 */ 2095 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin"); 2096 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result"); 2097 return y_result; 2098} 2099 2100 2101/** 2102 * Generate pow(x, y) 2103 */ 2104LLVMValueRef 2105lp_build_pow(struct lp_build_context *bld, 2106 LLVMValueRef x, 2107 LLVMValueRef y) 2108{ 2109 /* TODO: optimize the constant case */ 2110 if (gallivm_debug & GALLIVM_DEBUG_PERF && 2111 LLVMIsConstant(x) && LLVMIsConstant(y)) { 2112 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2113 __FUNCTION__); 2114 } 2115 2116 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y)); 2117} 2118 2119 2120/** 2121 * Generate exp(x) 2122 */ 2123LLVMValueRef 2124lp_build_exp(struct lp_build_context *bld, 2125 LLVMValueRef x) 2126{ 2127 /* log2(e) = 1/log(2) */ 2128 LLVMValueRef log2e = 
lp_build_const_vec(bld->gallivm, bld->type, 2129 1.4426950408889634); 2130 2131 assert(lp_check_value(bld->type, x)); 2132 2133 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x)); 2134} 2135 2136 2137/** 2138 * Generate log(x) 2139 */ 2140LLVMValueRef 2141lp_build_log(struct lp_build_context *bld, 2142 LLVMValueRef x) 2143{ 2144 /* log(2) */ 2145 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 2146 0.69314718055994529); 2147 2148 assert(lp_check_value(bld->type, x)); 2149 2150 return lp_build_mul(bld, log2, lp_build_log2(bld, x)); 2151} 2152 2153 2154/** 2155 * Generate polynomial. 2156 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 2157 */ 2158static LLVMValueRef 2159lp_build_polynomial(struct lp_build_context *bld, 2160 LLVMValueRef x, 2161 const double *coeffs, 2162 unsigned num_coeffs) 2163{ 2164 const struct lp_type type = bld->type; 2165 LLVMValueRef res = NULL; 2166 unsigned i; 2167 2168 assert(lp_check_value(bld->type, x)); 2169 2170 /* TODO: optimize the constant case */ 2171 if (gallivm_debug & GALLIVM_DEBUG_PERF && 2172 LLVMIsConstant(x)) { 2173 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2174 __FUNCTION__); 2175 } 2176 2177 for (i = num_coeffs; i--; ) { 2178 LLVMValueRef coeff; 2179 2180 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]); 2181 2182 if(res) 2183 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res)); 2184 else 2185 res = coeff; 2186 } 2187 2188 if(res) 2189 return res; 2190 else 2191 return bld->undef; 2192} 2193 2194 2195/** 2196 * Minimax polynomial fit of 2**x, in range [0, 1[ 2197 */ 2198const double lp_build_exp2_polynomial[] = { 2199#if EXP_POLY_DEGREE == 5 2200 0.999999925063526176901, 2201 0.693153073200168932794, 2202 0.240153617044375388211, 2203 0.0558263180532956664775, 2204 0.00898934009049466391101, 2205 0.00187757667519147912699 2206#elif EXP_POLY_DEGREE == 4 2207 1.00000259337069434683, 2208 0.693003834469974940458, 2209 0.24144275689150793076, 2210 
0.0520114606103070150235, 2211 0.0135341679161270268764 2212#elif EXP_POLY_DEGREE == 3 2213 0.999925218562710312959, 2214 0.695833540494823811697, 2215 0.226067155427249155588, 2216 0.0780245226406372992967 2217#elif EXP_POLY_DEGREE == 2 2218 1.00172476321474503578, 2219 0.657636275736077639316, 2220 0.33718943461968720704 2221#else 2222#error 2223#endif 2224}; 2225 2226 2227void 2228lp_build_exp2_approx(struct lp_build_context *bld, 2229 LLVMValueRef x, 2230 LLVMValueRef *p_exp2_int_part, 2231 LLVMValueRef *p_frac_part, 2232 LLVMValueRef *p_exp2) 2233{ 2234 LLVMBuilderRef builder = bld->gallivm->builder; 2235 const struct lp_type type = bld->type; 2236 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 2237 LLVMValueRef ipart = NULL; 2238 LLVMValueRef fpart = NULL; 2239 LLVMValueRef expipart = NULL; 2240 LLVMValueRef expfpart = NULL; 2241 LLVMValueRef res = NULL; 2242 2243 assert(lp_check_value(bld->type, x)); 2244 2245 if(p_exp2_int_part || p_frac_part || p_exp2) { 2246 /* TODO: optimize the constant case */ 2247 if (gallivm_debug & GALLIVM_DEBUG_PERF && 2248 LLVMIsConstant(x)) { 2249 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2250 __FUNCTION__); 2251 } 2252 2253 assert(type.floating && type.width == 32); 2254 2255 x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0)); 2256 x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999)); 2257 2258 /* ipart = floor(x) */ 2259 /* fpart = x - ipart */ 2260 lp_build_ifloor_fract(bld, x, &ipart, &fpart); 2261 } 2262 2263 if(p_exp2_int_part || p_exp2) { 2264 /* expipart = (float) (1 << ipart) */ 2265 expipart = LLVMBuildAdd(builder, ipart, 2266 lp_build_const_int_vec(bld->gallivm, type, 127), ""); 2267 expipart = LLVMBuildShl(builder, expipart, 2268 lp_build_const_int_vec(bld->gallivm, type, 23), ""); 2269 expipart = LLVMBuildBitCast(builder, expipart, vec_type, ""); 2270 } 2271 2272 if(p_exp2) { 2273 expfpart = lp_build_polynomial(bld, fpart, 
lp_build_exp2_polynomial, 2274 Elements(lp_build_exp2_polynomial)); 2275 2276 res = LLVMBuildFMul(builder, expipart, expfpart, ""); 2277 } 2278 2279 if(p_exp2_int_part) 2280 *p_exp2_int_part = expipart; 2281 2282 if(p_frac_part) 2283 *p_frac_part = fpart; 2284 2285 if(p_exp2) 2286 *p_exp2 = res; 2287} 2288 2289 2290LLVMValueRef 2291lp_build_exp2(struct lp_build_context *bld, 2292 LLVMValueRef x) 2293{ 2294 LLVMValueRef res; 2295 lp_build_exp2_approx(bld, x, NULL, NULL, &res); 2296 return res; 2297} 2298 2299 2300/** 2301 * Extract the exponent of a IEEE-754 floating point value. 2302 * 2303 * Optionally apply an integer bias. 2304 * 2305 * Result is an integer value with 2306 * 2307 * ifloor(log2(x)) + bias 2308 */ 2309LLVMValueRef 2310lp_build_extract_exponent(struct lp_build_context *bld, 2311 LLVMValueRef x, 2312 int bias) 2313{ 2314 LLVMBuilderRef builder = bld->gallivm->builder; 2315 const struct lp_type type = bld->type; 2316 unsigned mantissa = lp_mantissa(type); 2317 LLVMValueRef res; 2318 2319 assert(type.floating); 2320 2321 assert(lp_check_value(bld->type, x)); 2322 2323 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 2324 2325 res = LLVMBuildLShr(builder, x, 2326 lp_build_const_int_vec(bld->gallivm, type, mantissa), ""); 2327 res = LLVMBuildAnd(builder, res, 2328 lp_build_const_int_vec(bld->gallivm, type, 255), ""); 2329 res = LLVMBuildSub(builder, res, 2330 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), ""); 2331 2332 return res; 2333} 2334 2335 2336/** 2337 * Extract the mantissa of the a floating. 
2338 * 2339 * Result is a floating point value with 2340 * 2341 * x / floor(log2(x)) 2342 */ 2343LLVMValueRef 2344lp_build_extract_mantissa(struct lp_build_context *bld, 2345 LLVMValueRef x) 2346{ 2347 LLVMBuilderRef builder = bld->gallivm->builder; 2348 const struct lp_type type = bld->type; 2349 unsigned mantissa = lp_mantissa(type); 2350 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 2351 (1ULL << mantissa) - 1); 2352 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); 2353 LLVMValueRef res; 2354 2355 assert(lp_check_value(bld->type, x)); 2356 2357 assert(type.floating); 2358 2359 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 2360 2361 /* res = x / 2**ipart */ 2362 res = LLVMBuildAnd(builder, x, mantmask, ""); 2363 res = LLVMBuildOr(builder, res, one, ""); 2364 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 2365 2366 return res; 2367} 2368 2369 2370 2371/** 2372 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 2373 * These coefficients can be generate with 2374 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html 2375 */ 2376const double lp_build_log2_polynomial[] = { 2377#if LOG_POLY_DEGREE == 6 2378 3.11578814719469302614, 2379 -3.32419399085241980044, 2380 2.59883907202499966007, 2381 -1.23152682416275988241, 2382 0.318212422185251071475, 2383 -0.0344359067839062357313 2384#elif LOG_POLY_DEGREE == 5 2385 2.8882704548164776201, 2386 -2.52074962577807006663, 2387 1.48116647521213171641, 2388 -0.465725644288844778798, 2389 0.0596515482674574969533 2390#elif LOG_POLY_DEGREE == 4 2391 2.61761038894603480148, 2392 -1.75647175389045657003, 2393 0.688243882994381274313, 2394 -0.107254423828329604454 2395#elif LOG_POLY_DEGREE == 3 2396 2.28330284476918490682, 2397 -1.04913055217340124191, 2398 0.204446009836232697516 2399#else 2400#error 2401#endif 2402}; 2403 2404 2405/** 2406 * See 
http://www.devmaster.net/forums/showthread.php?p=43580 2407 */ 2408void 2409lp_build_log2_approx(struct lp_build_context *bld, 2410 LLVMValueRef x, 2411 LLVMValueRef *p_exp, 2412 LLVMValueRef *p_floor_log2, 2413 LLVMValueRef *p_log2) 2414{ 2415 LLVMBuilderRef builder = bld->gallivm->builder; 2416 const struct lp_type type = bld->type; 2417 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 2418 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 2419 2420 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000); 2421 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff); 2422 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); 2423 2424 LLVMValueRef i = NULL; 2425 LLVMValueRef exp = NULL; 2426 LLVMValueRef mant = NULL; 2427 LLVMValueRef logexp = NULL; 2428 LLVMValueRef logmant = NULL; 2429 LLVMValueRef res = NULL; 2430 2431 assert(lp_check_value(bld->type, x)); 2432 2433 if(p_exp || p_floor_log2 || p_log2) { 2434 /* TODO: optimize the constant case */ 2435 if (gallivm_debug & GALLIVM_DEBUG_PERF && 2436 LLVMIsConstant(x)) { 2437 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2438 __FUNCTION__); 2439 } 2440 2441 assert(type.floating && type.width == 32); 2442 2443 /* 2444 * We don't explicitly handle denormalized numbers. They will yield a 2445 * result in the neighbourhood of -127, which appears to be adequate 2446 * enough. 
2447 */ 2448 2449 i = LLVMBuildBitCast(builder, x, int_vec_type, ""); 2450 2451 /* exp = (float) exponent(x) */ 2452 exp = LLVMBuildAnd(builder, i, expmask, ""); 2453 } 2454 2455 if(p_floor_log2 || p_log2) { 2456 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), ""); 2457 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), ""); 2458 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, ""); 2459 } 2460 2461 if(p_log2) { 2462 /* mant = (float) mantissa(x) */ 2463 mant = LLVMBuildAnd(builder, i, mantmask, ""); 2464 mant = LLVMBuildOr(builder, mant, one, ""); 2465 mant = LLVMBuildBitCast(builder, mant, vec_type, ""); 2466 2467 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial, 2468 Elements(lp_build_log2_polynomial)); 2469 2470 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ 2471 logmant = LLVMBuildFMul(builder, logmant, LLVMBuildFSub(builder, mant, bld->one, ""), ""); 2472 2473 res = LLVMBuildFAdd(builder, logmant, logexp, ""); 2474 } 2475 2476 if(p_exp) { 2477 exp = LLVMBuildBitCast(builder, exp, vec_type, ""); 2478 *p_exp = exp; 2479 } 2480 2481 if(p_floor_log2) 2482 *p_floor_log2 = logexp; 2483 2484 if(p_log2) 2485 *p_log2 = res; 2486} 2487 2488 2489LLVMValueRef 2490lp_build_log2(struct lp_build_context *bld, 2491 LLVMValueRef x) 2492{ 2493 LLVMValueRef res; 2494 lp_build_log2_approx(bld, x, NULL, NULL, &res); 2495 return res; 2496} 2497 2498 2499/** 2500 * Faster (and less accurate) log2. 2501 * 2502 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x)) 2503 * 2504 * Piece-wise linear approximation, with exact results when x is a 2505 * power of two. 
2506 * 2507 * See http://www.flipcode.com/archives/Fast_log_Function.shtml 2508 */ 2509LLVMValueRef 2510lp_build_fast_log2(struct lp_build_context *bld, 2511 LLVMValueRef x) 2512{ 2513 LLVMBuilderRef builder = bld->gallivm->builder; 2514 LLVMValueRef ipart; 2515 LLVMValueRef fpart; 2516 2517 assert(lp_check_value(bld->type, x)); 2518 2519 assert(bld->type.floating); 2520 2521 /* ipart = floor(log2(x)) - 1 */ 2522 ipart = lp_build_extract_exponent(bld, x, -1); 2523 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, ""); 2524 2525 /* fpart = x / 2**ipart */ 2526 fpart = lp_build_extract_mantissa(bld, x); 2527 2528 /* ipart + fpart */ 2529 return LLVMBuildFAdd(builder, ipart, fpart, ""); 2530} 2531 2532 2533/** 2534 * Fast implementation of iround(log2(x)). 2535 * 2536 * Not an approximation -- it should give accurate results all the time. 2537 */ 2538LLVMValueRef 2539lp_build_ilog2(struct lp_build_context *bld, 2540 LLVMValueRef x) 2541{ 2542 LLVMBuilderRef builder = bld->gallivm->builder; 2543 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2); 2544 LLVMValueRef ipart; 2545 2546 assert(bld->type.floating); 2547 2548 assert(lp_check_value(bld->type, x)); 2549 2550 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ 2551 x = LLVMBuildFMul(builder, x, sqrt2, ""); 2552 2553 /* ipart = floor(log2(x) + 0.5) */ 2554 ipart = lp_build_extract_exponent(bld, x, 0); 2555 2556 return ipart; 2557} 2558