lp_bld_arit.c revision dbadd395082d1c812733dff8c873f475d82c63e4
1cf2cfa174ca878c144e17e9fc60ca8e9070d7dededisonn@google.com/**************************************************************************
2cf2cfa174ca878c144e17e9fc60ca8e9070d7dededisonn@google.com *
3cf2cfa174ca878c144e17e9fc60ca8e9070d7dededisonn@google.com * Copyright 2009-2010 VMware, Inc.
4cf2cfa174ca878c144e17e9fc60ca8e9070d7dededisonn@google.com * All Rights Reserved.
5cf2cfa174ca878c144e17e9fc60ca8e9070d7dededisonn@google.com *
6cf2cfa174ca878c144e17e9fc60ca8e9070d7dededisonn@google.com * Permission is hereby granted, free of charge, to any person obtaining a
7cf2cfa174ca878c144e17e9fc60ca8e9070d7dededisonn@google.com * copy of this software and associated documentation files (the
8cf2cfa174ca878c144e17e9fc60ca8e9070d7dededisonn@google.com * "Software"), to deal in the Software without restriction, including
9cf2cfa174ca878c144e17e9fc60ca8e9070d7dededisonn@google.com * without limitation the rights to use, copy, modify, merge, publish,
108cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * distribute, sub license, and/or sell copies of the Software, and to
118cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * permit persons to whom the Software is furnished to do so, subject to
128cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * the following conditions:
138cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
148cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * The above copyright notice and this permission notice (including the
158cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * next paragraph) shall be included in all copies or substantial portions
168cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * of the Software.
178cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
188cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
198cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
208cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
218cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
228cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
238cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
248cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
258cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
268cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com **************************************************************************/
278cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
288cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
298cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/**
308cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * @file
318cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Helper
328cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
378cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
 * We also do simple expression simplification here. Reasons are:
398cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * - it is very easy given we have all necessary information readily available
408cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * - LLVM optimization passes fail to simplify several vector expressions
418cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * - We often know value constraints which the optimization passes have no way
428cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *   of knowing, such as when source arguments are known to be in [0, 1] range.
438cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
448cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * @author Jose Fonseca <jfonseca@vmware.com>
458cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */
468cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
478cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
488cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "util/u_memory.h"
498cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "util/u_debug.h"
508cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "util/u_math.h"
518cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "util/u_string.h"
528cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "util/u_cpu_detect.h"
538cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
548cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_type.h"
558cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_const.h"
568cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_init.h"
578cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_intr.h"
588cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_logic.h"
598cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_pack.h"
608cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_debug.h"
618cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#include "lp_bld_arit.h"
628cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
638cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
648cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#define EXP_POLY_DEGREE 5
658cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
668cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#define LOG_POLY_DEGREE 5
678cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
688cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
698cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/**
708cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate min(a, b)
718cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * No checks for special case values of a or b = 1 or 0 are done.
728cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */
738cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comstatic LLVMValueRef
748cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_min_simple(struct lp_build_context *bld,
758cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                    LLVMValueRef a,
768cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                    LLVMValueRef b)
778cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{
788cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMBuilderRef builder = bld->gallivm->builder;
798cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   const struct lp_type type = bld->type;
808cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   const char *intrinsic = NULL;
818cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMValueRef cond;
828cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
838cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, a));
848cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, b));
858cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
868cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   /* TODO: optimize the constant case */
878cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
888cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(type.width * type.length == 128) {
898cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(type.floating) {
908cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 32 && util_cpu_caps.has_sse)
918cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse.min.ps";
928cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 64 && util_cpu_caps.has_sse2)
938cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse2.min.pd";
948cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      }
958cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else {
968cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
978cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse2.pminu.b";
988cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
998cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse41.pminsb";
1008cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
1018cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse41.pminuw";
1028cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
1038cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse2.pmins.w";
1048cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
1058cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse41.pminud";
1068cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
1078cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse41.pminsd";
1088cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      }
1098cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   }
1108cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1118cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(intrinsic)
1128cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
1138cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1148cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
1158cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   return lp_build_select(bld, cond, a, b);
1168cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com}
1178cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1188cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1198cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/**
1208cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate max(a, b)
1218cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * No checks for special case values of a or b = 1 or 0 are done.
1228cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */
1238cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comstatic LLVMValueRef
1248cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_max_simple(struct lp_build_context *bld,
1258cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                    LLVMValueRef a,
1268cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                    LLVMValueRef b)
1278cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{
1288cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMBuilderRef builder = bld->gallivm->builder;
1298cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   const struct lp_type type = bld->type;
1308cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   const char *intrinsic = NULL;
1318cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMValueRef cond;
1328cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1338cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, a));
1348cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, b));
1358cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1368cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   /* TODO: optimize the constant case */
1378cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1388cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(type.width * type.length == 128) {
1398cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(type.floating) {
1408cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 32 && util_cpu_caps.has_sse)
1418cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse.max.ps";
1428cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 64 && util_cpu_caps.has_sse2)
1438cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse2.max.pd";
1448cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      }
1458cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else {
1468cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
1478cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse2.pmaxu.b";
1488cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
1498cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse41.pmaxsb";
1508cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
1518cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse41.pmaxuw";
1528cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
1538cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse2.pmaxs.w";
1548cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
1558cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse41.pmaxud";
1568cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
1578cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = "llvm.x86.sse41.pmaxsd";
1588cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      }
1598cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   }
1608cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1618cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(intrinsic)
1628cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
1638cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1648cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
1658cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   return lp_build_select(bld, cond, a, b);
1668cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com}
1678cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1688cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1698cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/**
1708cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate 1 - a, or ~a depending on bld->type.
1718cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */
1728cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef
1738cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_comp(struct lp_build_context *bld,
1748cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com              LLVMValueRef a)
1758cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{
1768cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMBuilderRef builder = bld->gallivm->builder;
1778cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   const struct lp_type type = bld->type;
1788cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1798cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, a));
1808cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1818cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(a == bld->one)
1828cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return bld->zero;
1838cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(a == bld->zero)
1848cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return bld->one;
1858cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1868cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(type.norm && !type.floating && !type.fixed && !type.sign) {
1878cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(LLVMIsConstant(a))
1888cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         return LLVMConstNot(a);
1898cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else
1908cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         return LLVMBuildNot(builder, a, "");
1918cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   }
1928cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
1938cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(LLVMIsConstant(a))
1948cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if (type.floating)
1958cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com          return LLVMConstFSub(bld->one, a);
1968cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else
1978cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com          return LLVMConstSub(bld->one, a);
1988cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   else
1998cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if (type.floating)
2008cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         return LLVMBuildFSub(builder, bld->one, a, "");
2018cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else
2028cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         return LLVMBuildSub(builder, bld->one, a, "");
2038cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com}
2048cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2058cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2068cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/**
2078cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate a + b
2088cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */
2098cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef
2108cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_add(struct lp_build_context *bld,
2118cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com             LLVMValueRef a,
2128cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com             LLVMValueRef b)
2138cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{
2148cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMBuilderRef builder = bld->gallivm->builder;
2158cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   const struct lp_type type = bld->type;
2168cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMValueRef res;
2178cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2188cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, a));
2198cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, b));
2208cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2218cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(a == bld->zero)
2228cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return b;
2238cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(b == bld->zero)
2248cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return a;
2258cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(a == bld->undef || b == bld->undef)
2268cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return bld->undef;
2278cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2288cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(bld->type.norm) {
2298cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      const char *intrinsic = NULL;
2308cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2318cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(a == bld->one || b == bld->one)
2328cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com        return bld->one;
2338cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2348cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(util_cpu_caps.has_sse2 &&
2358cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         type.width * type.length == 128 &&
2368cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         !type.floating && !type.fixed) {
2378cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 8)
2388cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
2398cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 16)
2408cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
2418cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      }
2428cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2438cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(intrinsic)
2448cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
2458cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   }
2468cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2478cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(LLVMIsConstant(a) && LLVMIsConstant(b))
2488cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if (type.floating)
2498cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMConstFAdd(a, b);
2508cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else
2518cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMConstAdd(a, b);
2528cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   else
2538cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if (type.floating)
2548cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMBuildFAdd(builder, a, b, "");
2558cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else
2568cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMBuildAdd(builder, a, b, "");
2578cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2588cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   /* clamp to ceiling of 1.0 */
2598cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
2608cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      res = lp_build_min_simple(bld, res, bld->one);
2618cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2628cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   /* XXX clamp to floor of -1 or 0??? */
2638cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2648cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   return res;
2658cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com}
2668cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2678cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2688cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/** Return the scalar sum of the elements of a */
2698cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef
2708cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_sum_vector(struct lp_build_context *bld,
2718cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                    LLVMValueRef a)
2728cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{
2738cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMBuilderRef builder = bld->gallivm->builder;
2748cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   const struct lp_type type = bld->type;
2758cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMValueRef index, res;
2768cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   unsigned i;
2778cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2788cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, a));
2798cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2808cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if (type.length == 1) {
2818cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return a;
2828cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   }
2838cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2848cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(!bld->type.norm);
2858cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2868cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   index = lp_build_const_int32(bld->gallivm, 0);
2878cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   res = LLVMBuildExtractElement(builder, a, index, "");
2888cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
2898cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   for (i = 1; i < type.length; i++) {
2908cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      index = lp_build_const_int32(bld->gallivm, i);
2918cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if (type.floating)
2928cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMBuildFAdd(builder, res,
2938cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                            LLVMBuildExtractElement(builder,
2948cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                                                    a, index, ""),
2958cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                            "");
2968cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else
2978cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMBuildAdd(builder, res,
2988cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                            LLVMBuildExtractElement(builder,
2998cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                                                    a, index, ""),
3008cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                            "");
3018cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   }
3028cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3038cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   return res;
3048cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com}
3058cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3068cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3078cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/**
3088cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate a - b
3098cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */
3108cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef
3118cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_sub(struct lp_build_context *bld,
3128cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com             LLVMValueRef a,
3138cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com             LLVMValueRef b)
3148cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{
3158cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMBuilderRef builder = bld->gallivm->builder;
3168cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   const struct lp_type type = bld->type;
3178cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMValueRef res;
3188cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3198cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, a));
3208cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, b));
3218cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3228cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(b == bld->zero)
3238cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return a;
3248cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(a == bld->undef || b == bld->undef)
3258cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return bld->undef;
3268cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(a == b)
3278cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return bld->zero;
3288cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3298cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(bld->type.norm) {
3308cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      const char *intrinsic = NULL;
3318cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3328cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(b == bld->one)
3338cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com        return bld->zero;
3348cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3358cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(util_cpu_caps.has_sse2 &&
3368cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         type.width * type.length == 128 &&
3378cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         !type.floating && !type.fixed) {
3388cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 8)
3398cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
3408cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.width == 16)
3418cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
3428cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      }
3438cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3448cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(intrinsic)
3458cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
3468cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   }
3478cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3488cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(LLVMIsConstant(a) && LLVMIsConstant(b))
3498cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if (type.floating)
3508cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMConstFSub(a, b);
3518cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else
3528cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMConstSub(a, b);
3538cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   else
3548cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if (type.floating)
3558cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMBuildFSub(builder, a, b, "");
3568cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else
3578cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMBuildSub(builder, a, b, "");
3588cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3598cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
3608cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      res = lp_build_max_simple(bld, res, bld->zero);
3618cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3628cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   return res;
3638cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com}
3648cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3658cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
3668cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/**
3678cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Normalized 8bit multiplication.
3688cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3698cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * - alpha plus one
3708cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3718cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     makes the following approximation to the division (Sree)
3728cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3738cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *       a*b/255 ~= (a*(b + 1)) >> 256
3748cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3758cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     which is the fastest method that satisfies the following OpenGL criteria
3768cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3778cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *       0*0 = 0 and 255*255 = 255
3788cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3798cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * - geometric series
3808cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3818cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     takes the geometric series approximation to the division
3828cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3838cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
3848cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3858cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     in this case just the first two terms to fit in 16bit arithmetic
3868cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3878cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *       t/255 ~= (t + (t >> 8)) >> 8
3888cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3898cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     note that just by itself it doesn't satisfies the OpenGL criteria, as
3908cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
3918cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     must be used
3928cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3938cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * - geometric series plus rounding
3948cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3958cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     when using a geometric series division instead of truncating the result
3968cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     use roundoff in the approximation (Jim Blinn)
3978cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
3988cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
3998cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
4008cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     achieving the exact results
4018cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *
4028cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
4038cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
4048cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * @sa Michael Herf, The "double blend trick", May 2000,
4058cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com *     http://www.stereopsis.com/doubleblend.html
4068cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */
4078cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comstatic LLVMValueRef
4088cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_mul_u8n(struct gallivm_state *gallivm,
4098cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                 struct lp_type i16_type,
4108cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com                 LLVMValueRef a, LLVMValueRef b)
4118cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{
4128cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMBuilderRef builder = gallivm->builder;
4138cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMValueRef c8;
4148cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMValueRef ab;
4158cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4168cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(!i16_type.floating);
4178cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(i16_type, a));
4188cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(i16_type, b));
4198cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4208cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   c8 = lp_build_const_int_vec(gallivm, i16_type, 8);
4218cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4228cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#if 0
4238cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4248cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   /* a*b/255 ~= (a*(b + 1)) >> 256 */
4258cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallium, i16_type, 1), "");
4268cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   ab = LLVMBuildMul(builder, a, b, "");
4278cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4288cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#else
4298cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4308cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
4318cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   ab = LLVMBuildMul(builder, a, b, "");
4328cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
4338cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");
4348cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4358cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com#endif
4368cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4378cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   ab = LLVMBuildLShr(builder, ab, c8, "");
4388cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4398cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   return ab;
4408cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com}
4418cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4428cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4438cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/**
4448cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Generate a * b
4458cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */
4468cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef
4478cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comlp_build_mul(struct lp_build_context *bld,
4488cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com             LLVMValueRef a,
4498cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com             LLVMValueRef b)
4508cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com{
4518cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMBuilderRef builder = bld->gallivm->builder;
4528cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   const struct lp_type type = bld->type;
4538cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMValueRef shift;
4548cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   LLVMValueRef res;
4558cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4568cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, a));
4578cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   assert(lp_check_value(type, b));
4588cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4598cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(a == bld->zero)
4608cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return bld->zero;
4618cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(a == bld->one)
4628cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return b;
4638cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(b == bld->zero)
4648cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return bld->zero;
4658cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(b == bld->one)
4668cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return a;
4678cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(a == bld->undef || b == bld->undef)
4688cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      return bld->undef;
4698cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4708cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(!type.floating && !type.fixed && type.norm) {
4718cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(type.width == 8) {
4728cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         struct lp_type i16_type = lp_wider_type(type);
4738cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         LLVMValueRef al, ah, bl, bh, abl, abh, ab;
4748cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4758cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
4768cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);
4778cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4788cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         /* PMULLW, PSRLW, PADDW */
4798cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
4808cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);
4818cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4828cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);
4838cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4848cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         return ab;
4858cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      }
4868cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4878cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      /* FIXME */
4888cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      assert(0);
4898cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   }
4908cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4918cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(type.fixed)
4928cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
4938cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   else
4948cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      shift = NULL;
4958cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
4968cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
4978cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if (type.floating)
4988cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMConstFMul(a, b);
4998cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else
5008cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMConstMul(a, b);
5018cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(shift) {
5028cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.sign)
5038cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            res = LLVMConstAShr(res, shift);
5048cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         else
5058cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            res = LLVMConstLShr(res, shift);
5068cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      }
5078cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   }
5088cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   else {
5098cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if (type.floating)
5108cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMBuildFMul(builder, a, b, "");
5118cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      else
5128cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         res = LLVMBuildMul(builder, a, b, "");
5138cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      if(shift) {
5148cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         if(type.sign)
5158cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            res = LLVMBuildAShr(builder, res, shift, "");
5168cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com         else
5178cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com            res = LLVMBuildLShr(builder, res, shift, "");
5188cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com      }
5198cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   }
5208cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
5218cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com   return res;
5228cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com}
5238cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
5248cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com
5258cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com/**
5268cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com * Small vector x scale multiplication optimization.
5278cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.com */
5288cee797901763ab0922eb9ef484cfdcbc94bee54edisonn@google.comLLVMValueRef
529cf2cfa174ca878c144e17e9fc60ca8e9070d7dededisonn@google.comlp_build_mul_imm(struct lp_build_context *bld,
530                 LLVMValueRef a,
531                 int b)
532{
533   LLVMBuilderRef builder = bld->gallivm->builder;
534   LLVMValueRef factor;
535
536   assert(lp_check_value(bld->type, a));
537
538   if(b == 0)
539      return bld->zero;
540
541   if(b == 1)
542      return a;
543
544   if(b == -1)
545      return lp_build_negate(bld, a);
546
547   if(b == 2 && bld->type.floating)
548      return lp_build_add(bld, a, a);
549
550   if(util_is_power_of_two(b)) {
551      unsigned shift = ffs(b) - 1;
552
553      if(bld->type.floating) {
554#if 0
555         /*
556          * Power of two multiplication by directly manipulating the mantissa.
557          *
558          * XXX: This might not be always faster, it will introduce a small error
559          * for multiplication by zero, and it will produce wrong results
560          * for Inf and NaN.
561          */
562         unsigned mantissa = lp_mantissa(bld->type);
563         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
564         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
565         a = LLVMBuildAdd(builder, a, factor, "");
566         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
567         return a;
568#endif
569      }
570      else {
571         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
572         return LLVMBuildShl(builder, a, factor, "");
573      }
574   }
575
576   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
577   return lp_build_mul(bld, a, factor);
578}
579
580
581/**
582 * Generate a / b
583 */
584LLVMValueRef
585lp_build_div(struct lp_build_context *bld,
586             LLVMValueRef a,
587             LLVMValueRef b)
588{
589   LLVMBuilderRef builder = bld->gallivm->builder;
590   const struct lp_type type = bld->type;
591
592   assert(lp_check_value(type, a));
593   assert(lp_check_value(type, b));
594
595   if(a == bld->zero)
596      return bld->zero;
597   if(a == bld->one)
598      return lp_build_rcp(bld, b);
599   if(b == bld->zero)
600      return bld->undef;
601   if(b == bld->one)
602      return a;
603   if(a == bld->undef || b == bld->undef)
604      return bld->undef;
605
606   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
607      if (type.floating)
608         return LLVMConstFDiv(a, b);
609      else if (type.sign)
610         return LLVMConstSDiv(a, b);
611      else
612         return LLVMConstUDiv(a, b);
613   }
614
615   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
616      return lp_build_mul(bld, a, lp_build_rcp(bld, b));
617
618   if (type.floating)
619      return LLVMBuildFDiv(builder, a, b, "");
620   else if (type.sign)
621      return LLVMBuildSDiv(builder, a, b, "");
622   else
623      return LLVMBuildUDiv(builder, a, b, "");
624}
625
626
627/**
628 * Linear interpolation -- without any checks.
629 *
630 * @sa http://www.stereopsis.com/doubleblend.html
631 */
632static INLINE LLVMValueRef
633lp_build_lerp_simple(struct lp_build_context *bld,
634                     LLVMValueRef x,
635                     LLVMValueRef v0,
636                     LLVMValueRef v1)
637{
638   LLVMBuilderRef builder = bld->gallivm->builder;
639   LLVMValueRef delta;
640   LLVMValueRef res;
641
642   assert(lp_check_value(bld->type, x));
643   assert(lp_check_value(bld->type, v0));
644   assert(lp_check_value(bld->type, v1));
645
646   delta = lp_build_sub(bld, v1, v0);
647
648   res = lp_build_mul(bld, x, delta);
649
650   res = lp_build_add(bld, v0, res);
651
652   if (bld->type.fixed) {
653      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
654       * but it will be wrong for other uses. Basically we need a more
655       * powerful lp_type, capable of further distinguishing the values
656       * interpretation from the value storage. */
657      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
658   }
659
660   return res;
661}
662
663
664/**
665 * Linear interpolation.
666 */
667LLVMValueRef
668lp_build_lerp(struct lp_build_context *bld,
669              LLVMValueRef x,
670              LLVMValueRef v0,
671              LLVMValueRef v1)
672{
673   LLVMBuilderRef builder = bld->gallivm->builder;
674   const struct lp_type type = bld->type;
675   LLVMValueRef res;
676
677   assert(lp_check_value(type, x));
678   assert(lp_check_value(type, v0));
679   assert(lp_check_value(type, v1));
680
681   if (type.norm) {
682      struct lp_type wide_type;
683      struct lp_build_context wide_bld;
684      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
685      LLVMValueRef shift;
686
687      assert(type.length >= 2);
688      assert(!type.sign);
689
690      /*
691       * Create a wider type, enough to hold the intermediate result of the
692       * multiplication.
693       */
694      memset(&wide_type, 0, sizeof wide_type);
695      wide_type.fixed  = TRUE;
696      wide_type.width  = type.width*2;
697      wide_type.length = type.length/2;
698
699      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
700
701      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
702      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
703      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
704
705      /*
706       * Scale x from [0, 255] to [0, 256]
707       */
708
709      shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);
710
711      xl = lp_build_add(&wide_bld, xl,
712                        LLVMBuildAShr(builder, xl, shift, ""));
713      xh = lp_build_add(&wide_bld, xh,
714                        LLVMBuildAShr(builder, xh, shift, ""));
715
716      /*
717       * Lerp both halves.
718       */
719
720      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
721      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);
722
723      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
724   } else {
725      res = lp_build_lerp_simple(bld, x, v0, v1);
726   }
727
728   return res;
729}
730
731
732LLVMValueRef
733lp_build_lerp_2d(struct lp_build_context *bld,
734                 LLVMValueRef x,
735                 LLVMValueRef y,
736                 LLVMValueRef v00,
737                 LLVMValueRef v01,
738                 LLVMValueRef v10,
739                 LLVMValueRef v11)
740{
741   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
742   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
743   return lp_build_lerp(bld, y, v0, v1);
744}
745
746
747/**
748 * Generate min(a, b)
749 * Do checks for special cases.
750 */
751LLVMValueRef
752lp_build_min(struct lp_build_context *bld,
753             LLVMValueRef a,
754             LLVMValueRef b)
755{
756   assert(lp_check_value(bld->type, a));
757   assert(lp_check_value(bld->type, b));
758
759   if(a == bld->undef || b == bld->undef)
760      return bld->undef;
761
762   if(a == b)
763      return a;
764
765   if(bld->type.norm) {
766      if(a == bld->zero || b == bld->zero)
767         return bld->zero;
768      if(a == bld->one)
769         return b;
770      if(b == bld->one)
771         return a;
772   }
773
774   return lp_build_min_simple(bld, a, b);
775}
776
777
778/**
779 * Generate max(a, b)
780 * Do checks for special cases.
781 */
782LLVMValueRef
783lp_build_max(struct lp_build_context *bld,
784             LLVMValueRef a,
785             LLVMValueRef b)
786{
787   assert(lp_check_value(bld->type, a));
788   assert(lp_check_value(bld->type, b));
789
790   if(a == bld->undef || b == bld->undef)
791      return bld->undef;
792
793   if(a == b)
794      return a;
795
796   if(bld->type.norm) {
797      if(a == bld->one || b == bld->one)
798         return bld->one;
799      if(a == bld->zero)
800         return b;
801      if(b == bld->zero)
802         return a;
803   }
804
805   return lp_build_max_simple(bld, a, b);
806}
807
808
809/**
810 * Generate clamp(a, min, max)
811 * Do checks for special cases.
812 */
813LLVMValueRef
814lp_build_clamp(struct lp_build_context *bld,
815               LLVMValueRef a,
816               LLVMValueRef min,
817               LLVMValueRef max)
818{
819   assert(lp_check_value(bld->type, a));
820   assert(lp_check_value(bld->type, min));
821   assert(lp_check_value(bld->type, max));
822
823   a = lp_build_min(bld, a, max);
824   a = lp_build_max(bld, a, min);
825   return a;
826}
827
828
829/**
830 * Generate abs(a)
831 */
832LLVMValueRef
833lp_build_abs(struct lp_build_context *bld,
834             LLVMValueRef a)
835{
836   LLVMBuilderRef builder = bld->gallivm->builder;
837   const struct lp_type type = bld->type;
838   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
839
840   assert(lp_check_value(type, a));
841
842   if(!type.sign)
843      return a;
844
845   if(type.floating) {
846      /* Mask out the sign bit */
847      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
848      unsigned long long absMask = ~(1ULL << (type.width - 1));
849      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
850      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
851      a = LLVMBuildAnd(builder, a, mask, "");
852      a = LLVMBuildBitCast(builder, a, vec_type, "");
853      return a;
854   }
855
856   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
857      switch(type.width) {
858      case 8:
859         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
860      case 16:
861         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
862      case 32:
863         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
864      }
865   }
866
867   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
868}
869
870
871LLVMValueRef
872lp_build_negate(struct lp_build_context *bld,
873                LLVMValueRef a)
874{
875   LLVMBuilderRef builder = bld->gallivm->builder;
876
877   assert(lp_check_value(bld->type, a));
878
879#if HAVE_LLVM >= 0x0207
880   if (bld->type.floating)
881      a = LLVMBuildFNeg(builder, a, "");
882   else
883#endif
884      a = LLVMBuildNeg(builder, a, "");
885
886   return a;
887}
888
889
890/** Return -1, 0 or +1 depending on the sign of a */
891LLVMValueRef
892lp_build_sgn(struct lp_build_context *bld,
893             LLVMValueRef a)
894{
895   LLVMBuilderRef builder = bld->gallivm->builder;
896   const struct lp_type type = bld->type;
897   LLVMValueRef cond;
898   LLVMValueRef res;
899
900   assert(lp_check_value(type, a));
901
902   /* Handle non-zero case */
903   if(!type.sign) {
904      /* if not zero then sign must be positive */
905      res = bld->one;
906   }
907   else if(type.floating) {
908      LLVMTypeRef vec_type;
909      LLVMTypeRef int_type;
910      LLVMValueRef mask;
911      LLVMValueRef sign;
912      LLVMValueRef one;
913      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
914
915      int_type = lp_build_int_vec_type(bld->gallivm, type);
916      vec_type = lp_build_vec_type(bld->gallivm, type);
917      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
918
919      /* Take the sign bit and add it to 1 constant */
920      sign = LLVMBuildBitCast(builder, a, int_type, "");
921      sign = LLVMBuildAnd(builder, sign, mask, "");
922      one = LLVMConstBitCast(bld->one, int_type);
923      res = LLVMBuildOr(builder, sign, one, "");
924      res = LLVMBuildBitCast(builder, res, vec_type, "");
925   }
926   else
927   {
928      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
929      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
930      res = lp_build_select(bld, cond, bld->one, minus_one);
931   }
932
933   /* Handle zero */
934   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
935   res = lp_build_select(bld, cond, bld->zero, res);
936
937   return res;
938}
939
940
941/**
942 * Set the sign of float vector 'a' according to 'sign'.
943 * If sign==0, return abs(a).
944 * If sign==1, return -abs(a);
945 * Other values for sign produce undefined results.
946 */
947LLVMValueRef
948lp_build_set_sign(struct lp_build_context *bld,
949                  LLVMValueRef a, LLVMValueRef sign)
950{
951   LLVMBuilderRef builder = bld->gallivm->builder;
952   const struct lp_type type = bld->type;
953   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
954   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
955   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
956   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
957                             ~((unsigned long long) 1 << (type.width - 1)));
958   LLVMValueRef val, res;
959
960   assert(type.floating);
961   assert(lp_check_value(type, a));
962
963   /* val = reinterpret_cast<int>(a) */
964   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
965   /* val = val & mask */
966   val = LLVMBuildAnd(builder, val, mask, "");
967   /* sign = sign << shift */
968   sign = LLVMBuildShl(builder, sign, shift, "");
969   /* res = val | sign */
970   res = LLVMBuildOr(builder, val, sign, "");
971   /* res = reinterpret_cast<float>(res) */
972   res = LLVMBuildBitCast(builder, res, vec_type, "");
973
974   return res;
975}
976
977
978/**
979 * Convert vector of (or scalar) int to vector of (or scalar) float.
980 */
981LLVMValueRef
982lp_build_int_to_float(struct lp_build_context *bld,
983                      LLVMValueRef a)
984{
985   LLVMBuilderRef builder = bld->gallivm->builder;
986   const struct lp_type type = bld->type;
987   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
988
989   assert(type.floating);
990
991   return LLVMBuildSIToFP(builder, a, vec_type, "");
992}
993
994
995
/**
 * Rounding modes understood by lp_build_round_sse41().
 *
 * The numeric values are passed unchanged as the i32 immediate operand of
 * the SSE4.1 round intrinsics, so they must not be renumbered.
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST  = 0,
   LP_BUILD_ROUND_SSE41_FLOOR    = 1,
   LP_BUILD_ROUND_SSE41_CEIL     = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};
1003
1004
1005/**
1006 * Helper for SSE4.1's ROUNDxx instructions.
1007 *
1008 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1009 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
1010 */
1011static INLINE LLVMValueRef
1012lp_build_round_sse41(struct lp_build_context *bld,
1013                     LLVMValueRef a,
1014                     enum lp_build_round_sse41_mode mode)
1015{
1016   LLVMBuilderRef builder = bld->gallivm->builder;
1017   const struct lp_type type = bld->type;
1018   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1019   const char *intrinsic;
1020   LLVMValueRef res;
1021
1022   assert(type.floating);
1023
1024   assert(lp_check_value(type, a));
1025   assert(util_cpu_caps.has_sse4_1);
1026
1027   if (type.length == 1) {
1028      LLVMTypeRef vec_type;
1029      LLVMValueRef undef;
1030      LLVMValueRef args[3];
1031      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1032
1033      switch(type.width) {
1034      case 32:
1035         intrinsic = "llvm.x86.sse41.round.ss";
1036         break;
1037      case 64:
1038         intrinsic = "llvm.x86.sse41.round.sd";
1039         break;
1040      default:
1041         assert(0);
1042         return bld->undef;
1043      }
1044
1045      vec_type = LLVMVectorType(bld->elem_type, 4);
1046
1047      undef = LLVMGetUndef(vec_type);
1048
1049      args[0] = undef;
1050      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1051      args[2] = LLVMConstInt(i32t, mode, 0);
1052
1053      res = lp_build_intrinsic(builder, intrinsic,
1054                               vec_type, args, Elements(args));
1055
1056      res = LLVMBuildExtractElement(builder, res, index0, "");
1057   }
1058   else {
1059      assert(type.width*type.length == 128);
1060
1061      switch(type.width) {
1062      case 32:
1063         intrinsic = "llvm.x86.sse41.round.ps";
1064         break;
1065      case 64:
1066         intrinsic = "llvm.x86.sse41.round.pd";
1067         break;
1068      default:
1069         assert(0);
1070         return bld->undef;
1071      }
1072
1073      res = lp_build_intrinsic_binary(builder, intrinsic,
1074                                      bld->vec_type, a,
1075                                      LLVMConstInt(i32t, mode, 0));
1076   }
1077
1078   return res;
1079}
1080
1081
1082static INLINE LLVMValueRef
1083lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1084                             LLVMValueRef a)
1085{
1086   LLVMBuilderRef builder = bld->gallivm->builder;
1087   const struct lp_type type = bld->type;
1088   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1089   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1090   const char *intrinsic;
1091   LLVMValueRef res;
1092
1093   assert(type.floating);
1094   /* using the double precision conversions is a bit more complicated */
1095   assert(type.width == 32);
1096
1097   assert(lp_check_value(type, a));
1098   assert(util_cpu_caps.has_sse2);
1099
1100   /* This is relying on MXCSR rounding mode, which should always be nearest. */
1101   if (type.length == 1) {
1102      LLVMTypeRef vec_type;
1103      LLVMValueRef undef;
1104      LLVMValueRef arg;
1105      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1106
1107      vec_type = LLVMVectorType(bld->elem_type, 4);
1108
1109      intrinsic = "llvm.x86.sse.cvtss2si";
1110
1111      undef = LLVMGetUndef(vec_type);
1112
1113      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1114
1115      res = lp_build_intrinsic_unary(builder, intrinsic,
1116                                     ret_type, arg);
1117   }
1118   else {
1119      assert(type.width*type.length == 128);
1120
1121      intrinsic = "llvm.x86.sse2.cvtps2dq";
1122
1123      res = lp_build_intrinsic_unary(builder, intrinsic,
1124                                     ret_type, a);
1125   }
1126
1127   return res;
1128}
1129
1130
1131/**
1132 * Return the integer part of a float (vector) value (== round toward zero).
1133 * The returned value is a float (vector).
1134 * Ex: trunc(-1.5) = -1.0
1135 */
1136LLVMValueRef
1137lp_build_trunc(struct lp_build_context *bld,
1138               LLVMValueRef a)
1139{
1140   LLVMBuilderRef builder = bld->gallivm->builder;
1141   const struct lp_type type = bld->type;
1142
1143   assert(type.floating);
1144   assert(lp_check_value(type, a));
1145
1146   if (util_cpu_caps.has_sse4_1 &&
1147       (type.length == 1 || type.width*type.length == 128)) {
1148      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
1149   }
1150   else {
1151      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1152      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1153      LLVMValueRef res;
1154      res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1155      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1156      return res;
1157   }
1158}
1159
1160
1161/**
1162 * Return float (vector) rounded to nearest integer (vector).  The returned
1163 * value is a float (vector).
1164 * Ex: round(0.9) = 1.0
1165 * Ex: round(-1.5) = -2.0
1166 */
1167LLVMValueRef
1168lp_build_round(struct lp_build_context *bld,
1169               LLVMValueRef a)
1170{
1171   LLVMBuilderRef builder = bld->gallivm->builder;
1172   const struct lp_type type = bld->type;
1173
1174   assert(type.floating);
1175   assert(lp_check_value(type, a));
1176
1177   if (util_cpu_caps.has_sse4_1 &&
1178       (type.length == 1 || type.width*type.length == 128)) {
1179      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1180   }
1181   else {
1182      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1183      LLVMValueRef res;
1184      res = lp_build_iround(bld, a);
1185      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1186      return res;
1187   }
1188}
1189
1190
1191/**
1192 * Return floor of float (vector), result is a float (vector)
1193 * Ex: floor(1.1) = 1.0
1194 * Ex: floor(-1.1) = -2.0
1195 */
1196LLVMValueRef
1197lp_build_floor(struct lp_build_context *bld,
1198               LLVMValueRef a)
1199{
1200   LLVMBuilderRef builder = bld->gallivm->builder;
1201   const struct lp_type type = bld->type;
1202
1203   assert(type.floating);
1204   assert(lp_check_value(type, a));
1205
1206   if (util_cpu_caps.has_sse4_1 &&
1207       (type.length == 1 || type.width*type.length == 128)) {
1208      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1209   }
1210   else {
1211      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1212      LLVMValueRef res;
1213      res = lp_build_ifloor(bld, a);
1214      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1215      return res;
1216   }
1217}
1218
1219
1220/**
1221 * Return ceiling of float (vector), returning float (vector).
1222 * Ex: ceil( 1.1) = 2.0
1223 * Ex: ceil(-1.1) = -1.0
1224 */
1225LLVMValueRef
1226lp_build_ceil(struct lp_build_context *bld,
1227              LLVMValueRef a)
1228{
1229   LLVMBuilderRef builder = bld->gallivm->builder;
1230   const struct lp_type type = bld->type;
1231
1232   assert(type.floating);
1233   assert(lp_check_value(type, a));
1234
1235   if (util_cpu_caps.has_sse4_1 &&
1236       (type.length == 1 || type.width*type.length == 128)) {
1237      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1238   }
1239   else {
1240      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1241      LLVMValueRef res;
1242      res = lp_build_iceil(bld, a);
1243      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1244      return res;
1245   }
1246}
1247
1248
1249/**
1250 * Return fractional part of 'a' computed as a - floor(a)
1251 * Typically used in texture coord arithmetic.
1252 */
1253LLVMValueRef
1254lp_build_fract(struct lp_build_context *bld,
1255               LLVMValueRef a)
1256{
1257   assert(bld->type.floating);
1258   return lp_build_sub(bld, a, lp_build_floor(bld, a));
1259}
1260
1261
1262/**
1263 * Return the integer part of a float (vector) value (== round toward zero).
1264 * The returned value is an integer (vector).
1265 * Ex: itrunc(-1.5) = -1
1266 */
1267LLVMValueRef
1268lp_build_itrunc(struct lp_build_context *bld,
1269                LLVMValueRef a)
1270{
1271   LLVMBuilderRef builder = bld->gallivm->builder;
1272   const struct lp_type type = bld->type;
1273   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1274
1275   assert(type.floating);
1276   assert(lp_check_value(type, a));
1277
1278   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1279}
1280
1281
1282/**
1283 * Return float (vector) rounded to nearest integer (vector).  The returned
1284 * value is an integer (vector).
1285 * Ex: iround(0.9) = 1
1286 * Ex: iround(-1.5) = -2
1287 */
1288LLVMValueRef
1289lp_build_iround(struct lp_build_context *bld,
1290                LLVMValueRef a)
1291{
1292   LLVMBuilderRef builder = bld->gallivm->builder;
1293   const struct lp_type type = bld->type;
1294   LLVMTypeRef int_vec_type = bld->int_vec_type;
1295   LLVMValueRef res;
1296
1297   assert(type.floating);
1298
1299   assert(lp_check_value(type, a));
1300
1301   if (util_cpu_caps.has_sse2 &&
1302       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
1303      return lp_build_iround_nearest_sse2(bld, a);
1304   }
1305   else if (util_cpu_caps.has_sse4_1 &&
1306       (type.length == 1 || type.width*type.length == 128)) {
1307      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1308   }
1309   else {
1310      LLVMValueRef half;
1311
1312      half = lp_build_const_vec(bld->gallivm, type, 0.5);
1313
1314      if (type.sign) {
1315         LLVMTypeRef vec_type = bld->vec_type;
1316         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1317                                    (unsigned long long)1 << (type.width - 1));
1318         LLVMValueRef sign;
1319
1320         /* get sign bit */
1321         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1322         sign = LLVMBuildAnd(builder, sign, mask, "");
1323
1324         /* sign * 0.5 */
1325         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1326         half = LLVMBuildOr(builder, sign, half, "");
1327         half = LLVMBuildBitCast(builder, half, vec_type, "");
1328      }
1329
1330      res = LLVMBuildFAdd(builder, a, half, "");
1331   }
1332
1333   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1334
1335   return res;
1336}
1337
1338
1339/**
1340 * Return floor of float (vector), result is an int (vector)
1341 * Ex: ifloor(1.1) = 1.0
1342 * Ex: ifloor(-1.1) = -2.0
1343 */
1344LLVMValueRef
1345lp_build_ifloor(struct lp_build_context *bld,
1346                LLVMValueRef a)
1347{
1348   LLVMBuilderRef builder = bld->gallivm->builder;
1349   const struct lp_type type = bld->type;
1350   LLVMTypeRef int_vec_type = bld->int_vec_type;
1351   LLVMValueRef res;
1352
1353   assert(type.floating);
1354   assert(lp_check_value(type, a));
1355
1356   if (util_cpu_caps.has_sse4_1 &&
1357       (type.length == 1 || type.width*type.length == 128)) {
1358      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1359   }
1360   else {
1361      res = a;
1362
1363      if (type.sign) {
1364         /* Take the sign bit and add it to 1 constant */
1365         LLVMTypeRef vec_type = bld->vec_type;
1366         unsigned mantissa = lp_mantissa(type);
1367         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1368                                  (unsigned long long)1 << (type.width - 1));
1369         LLVMValueRef sign;
1370         LLVMValueRef offset;
1371
1372         /* sign = a < 0 ? ~0 : 0 */
1373         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1374         sign = LLVMBuildAnd(builder, sign, mask, "");
1375         sign = LLVMBuildAShr(builder, sign,
1376                              lp_build_const_int_vec(bld->gallivm, type,
1377                                                     type.width - 1),
1378                              "ifloor.sign");
1379
1380         /* offset = -0.99999(9)f */
1381         offset = lp_build_const_vec(bld->gallivm, type,
1382                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1383         offset = LLVMConstBitCast(offset, int_vec_type);
1384
1385         /* offset = a < 0 ? offset : 0.0f */
1386         offset = LLVMBuildAnd(builder, offset, sign, "");
1387         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");
1388
1389         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
1390      }
1391   }
1392
1393   /* round to nearest (toward zero) */
1394   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
1395
1396   return res;
1397}
1398
1399
1400/**
1401 * Return ceiling of float (vector), returning int (vector).
1402 * Ex: iceil( 1.1) = 2
1403 * Ex: iceil(-1.1) = -1
1404 */
1405LLVMValueRef
1406lp_build_iceil(struct lp_build_context *bld,
1407               LLVMValueRef a)
1408{
1409   LLVMBuilderRef builder = bld->gallivm->builder;
1410   const struct lp_type type = bld->type;
1411   LLVMTypeRef int_vec_type = bld->int_vec_type;
1412   LLVMValueRef res;
1413
1414   assert(type.floating);
1415   assert(lp_check_value(type, a));
1416
1417   if (util_cpu_caps.has_sse4_1 &&
1418       (type.length == 1 || type.width*type.length == 128)) {
1419      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1420   }
1421   else {
1422      LLVMTypeRef vec_type = bld->vec_type;
1423      unsigned mantissa = lp_mantissa(type);
1424      LLVMValueRef offset;
1425
1426      /* offset = 0.99999(9)f */
1427      offset = lp_build_const_vec(bld->gallivm, type,
1428                                  (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1429
1430      if (type.sign) {
1431         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1432                                (unsigned long long)1 << (type.width - 1));
1433         LLVMValueRef sign;
1434
1435         /* sign = a < 0 ? 0 : ~0 */
1436         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1437         sign = LLVMBuildAnd(builder, sign, mask, "");
1438         sign = LLVMBuildAShr(builder, sign,
1439                              lp_build_const_int_vec(bld->gallivm, type,
1440                                                     type.width - 1),
1441                              "iceil.sign");
1442         sign = LLVMBuildNot(builder, sign, "iceil.not");
1443
1444         /* offset = a < 0 ? 0.0 : offset */
1445         offset = LLVMConstBitCast(offset, int_vec_type);
1446         offset = LLVMBuildAnd(builder, offset, sign, "");
1447         offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
1448      }
1449
1450      res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
1451   }
1452
1453   /* round to nearest (toward zero) */
1454   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
1455
1456   return res;
1457}
1458
1459
1460/**
1461 * Combined ifloor() & fract().
1462 *
1463 * Preferred to calling the functions separately, as it will ensure that the
1464 * stratergy (floor() vs ifloor()) that results in less redundant work is used.
1465 */
1466void
1467lp_build_ifloor_fract(struct lp_build_context *bld,
1468                      LLVMValueRef a,
1469                      LLVMValueRef *out_ipart,
1470                      LLVMValueRef *out_fpart)
1471{
1472   LLVMBuilderRef builder = bld->gallivm->builder;
1473   const struct lp_type type = bld->type;
1474   LLVMValueRef ipart;
1475
1476   assert(type.floating);
1477   assert(lp_check_value(type, a));
1478
1479   if (util_cpu_caps.has_sse4_1 &&
1480       (type.length == 1 || type.width*type.length == 128)) {
1481      /*
1482       * floor() is easier.
1483       */
1484
1485      ipart = lp_build_floor(bld, a);
1486      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1487      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
1488   }
1489   else {
1490      /*
1491       * ifloor() is easier.
1492       */
1493
1494      *out_ipart = lp_build_ifloor(bld, a);
1495      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
1496      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1497   }
1498}
1499
1500
1501LLVMValueRef
1502lp_build_sqrt(struct lp_build_context *bld,
1503              LLVMValueRef a)
1504{
1505   LLVMBuilderRef builder = bld->gallivm->builder;
1506   const struct lp_type type = bld->type;
1507   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1508   char intrinsic[32];
1509
1510   assert(lp_check_value(type, a));
1511
1512   /* TODO: optimize the constant case */
1513   /* TODO: optimize the constant case */
1514
1515   assert(type.floating);
1516   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1517
1518   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1519}
1520
1521
1522/**
1523 * Do one Newton-Raphson step to improve reciprocate precision:
1524 *
1525 *   x_{i+1} = x_i * (2 - a * x_i)
1526 *
1527 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1528 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
1529 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
1530 * halo. It would be necessary to clamp the argument to prevent this.
1531 *
1532 * See also:
1533 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1534 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1535 */
1536static INLINE LLVMValueRef
1537lp_build_rcp_refine(struct lp_build_context *bld,
1538                    LLVMValueRef a,
1539                    LLVMValueRef rcp_a)
1540{
1541   LLVMBuilderRef builder = bld->gallivm->builder;
1542   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
1543   LLVMValueRef res;
1544
1545   res = LLVMBuildFMul(builder, a, rcp_a, "");
1546   res = LLVMBuildFSub(builder, two, res, "");
1547   res = LLVMBuildFMul(builder, rcp_a, res, "");
1548
1549   return res;
1550}
1551
1552
1553LLVMValueRef
1554lp_build_rcp(struct lp_build_context *bld,
1555             LLVMValueRef a)
1556{
1557   LLVMBuilderRef builder = bld->gallivm->builder;
1558   const struct lp_type type = bld->type;
1559
1560   assert(lp_check_value(type, a));
1561
1562   if(a == bld->zero)
1563      return bld->undef;
1564   if(a == bld->one)
1565      return bld->one;
1566   if(a == bld->undef)
1567      return bld->undef;
1568
1569   assert(type.floating);
1570
1571   if(LLVMIsConstant(a))
1572      return LLVMConstFDiv(bld->one, a);
1573
1574   /*
1575    * We don't use RCPPS because:
1576    * - it only has 10bits of precision
1577    * - it doesn't even get the reciprocate of 1.0 exactly
1578    * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
1579    * - for recent processors the benefit over DIVPS is marginal, a case
1580    *   depedent
1581    *
1582    * We could still use it on certain processors if benchmarks show that the
1583    * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
1584    * particular uses that require less workarounds.
1585    */
1586
1587   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1588      const unsigned num_iterations = 0;
1589      LLVMValueRef res;
1590      unsigned i;
1591
1592      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
1593
1594      for (i = 0; i < num_iterations; ++i) {
1595         res = lp_build_rcp_refine(bld, a, res);
1596      }
1597
1598      return res;
1599   }
1600
1601   return LLVMBuildFDiv(builder, bld->one, a, "");
1602}
1603
1604
1605/**
1606 * Do one Newton-Raphson step to improve rsqrt precision:
1607 *
1608 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1609 *
1610 * See also:
1611 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1612 */
1613static INLINE LLVMValueRef
1614lp_build_rsqrt_refine(struct lp_build_context *bld,
1615                      LLVMValueRef a,
1616                      LLVMValueRef rsqrt_a)
1617{
1618   LLVMBuilderRef builder = bld->gallivm->builder;
1619   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
1620   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
1621   LLVMValueRef res;
1622
1623   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
1624   res = LLVMBuildFMul(builder, a, res, "");
1625   res = LLVMBuildFSub(builder, three, res, "");
1626   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
1627   res = LLVMBuildFMul(builder, half, res, "");
1628
1629   return res;
1630}
1631
1632
1633/**
1634 * Generate 1/sqrt(a)
1635 */
1636LLVMValueRef
1637lp_build_rsqrt(struct lp_build_context *bld,
1638               LLVMValueRef a)
1639{
1640   LLVMBuilderRef builder = bld->gallivm->builder;
1641   const struct lp_type type = bld->type;
1642
1643   assert(lp_check_value(type, a));
1644
1645   assert(type.floating);
1646
1647   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1648      const unsigned num_iterations = 1;
1649      LLVMValueRef res;
1650      unsigned i;
1651
1652      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1653
1654      for (i = 0; i < num_iterations; ++i) {
1655         res = lp_build_rsqrt_refine(bld, a, res);
1656      }
1657
1658      return res;
1659   }
1660
1661   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1662}
1663
1664
1665/**
1666 * Generate sin(a) using SSE2
1667 */
1668LLVMValueRef
1669lp_build_sin(struct lp_build_context *bld,
1670             LLVMValueRef a)
1671{
1672   struct gallivm_state *gallivm = bld->gallivm;
1673   LLVMBuilderRef builder = gallivm->builder;
1674   struct lp_type int_type = lp_int_type(bld->type);
1675   LLVMBuilderRef b = builder;
1676
1677   /*
1678    *  take the absolute value,
1679    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1680    */
1681
1682   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
1683   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
1684
1685   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1686   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
1687
1688   /*
1689    * extract the sign bit (upper one)
1690    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1691    */
1692   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
1693   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1694
1695   /*
1696    * scale by 4/Pi
1697    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1698    */
1699
1700   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
1701   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1702
1703   /*
1704    * store the integer part of y in mm0
1705    * emm2 = _mm_cvttps_epi32(y);
1706    */
1707
1708   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
1709
1710   /*
1711    * j=(j+1) & (~1) (see the cephes sources)
1712    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1713    */
1714
1715   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
1716   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1717   /*
1718    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1719    */
1720   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
1721   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1722
1723   /*
1724    * y = _mm_cvtepi32_ps(emm2);
1725    */
1726   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
1727
1728   /* get the swap sign flag
1729    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1730    */
1731   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
1732   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1733
1734   /*
1735    * emm2 = _mm_slli_epi32(emm0, 29);
1736    */
1737   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
1738   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1739
1740   /*
1741    * get the polynom selection mask
1742    * there is one polynom for 0 <= x <= Pi/4
1743    * and another one for Pi/4<x<=Pi/2
1744    * Both branches will be computed.
1745    *
1746    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1747    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1748    */
1749
1750   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1751   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1752   LLVMValueRef poly_mask = lp_build_compare(gallivm,
1753                                             int_type, PIPE_FUNC_EQUAL,
1754                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
1755   /*
1756    *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1757    */
1758   LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1759
1760   /*
1761    * _PS_CONST(minus_cephes_DP1, -0.78515625);
1762    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1763    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1764    */
1765   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
1766   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
1767   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
1768
1769   /*
1770    * The magic pass: "Extended precision modular arithmetic"
1771    * x = ((x - y * DP1) - y * DP2) - y * DP3;
1772    * xmm1 = _mm_mul_ps(y, xmm1);
1773    * xmm2 = _mm_mul_ps(y, xmm2);
1774    * xmm3 = _mm_mul_ps(y, xmm3);
1775    */
1776   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1777   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1778   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1779
1780   /*
1781    * x = _mm_add_ps(x, xmm1);
1782    * x = _mm_add_ps(x, xmm2);
1783    * x = _mm_add_ps(x, xmm3);
1784    */
1785
1786   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1787   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1788   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1789
1790   /*
1791    * Evaluate the first polynom  (0 <= x <= Pi/4)
1792    *
1793    * z = _mm_mul_ps(x,x);
1794    */
1795   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1796
1797   /*
1798    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
1799    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1800    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
1801    */
1802   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
1803   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
1804   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
1805
1806   /*
1807    * y = *(v4sf*)_ps_coscof_p0;
1808    * y = _mm_mul_ps(y, z);
1809    */
1810   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1811   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1812   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1813   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1814   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1815   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1816
1817
1818   /*
1819    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1820    * y = _mm_sub_ps(y, tmp);
1821    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1822    */
1823   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
1824   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1825   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1826   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
1827   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1828
1829   /*
1830    * _PS_CONST(sincof_p0, -1.9515295891E-4);
1831    * _PS_CONST(sincof_p1,  8.3321608736E-3);
1832    * _PS_CONST(sincof_p2, -1.6666654611E-1);
1833    */
1834   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
1835   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
1836   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
1837
1838   /*
1839    * Evaluate the second polynom  (Pi/4 <= x <= 0)
1840    *
1841    * y2 = *(v4sf*)_ps_sincof_p0;
1842    * y2 = _mm_mul_ps(y2, z);
1843    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1844    * y2 = _mm_mul_ps(y2, z);
1845    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1846    * y2 = _mm_mul_ps(y2, z);
1847    * y2 = _mm_mul_ps(y2, x);
1848    * y2 = _mm_add_ps(y2, x);
1849    */
1850
1851   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1852   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1853   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1854   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1855   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1856   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1857   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1858
1859   /*
1860    * select the correct result from the two polynoms
1861    * xmm3 = poly_mask;
1862    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1863    * y = _mm_andnot_ps(xmm3, y);
1864    * y = _mm_add_ps(y,y2);
1865    */
1866   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
1867   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
1868   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1869   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
1870   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1871   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1872   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1873
1874   /*
1875    * update the sign
1876    * y = _mm_xor_ps(y, sign_bit);
1877    */
1878   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1879   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
1880   return y_result;
1881}
1882
1883
1884/**
1885 * Generate cos(a) using SSE2
1886 */
1887LLVMValueRef
1888lp_build_cos(struct lp_build_context *bld,
1889             LLVMValueRef a)
1890{
1891   struct gallivm_state *gallivm = bld->gallivm;
1892   LLVMBuilderRef builder = gallivm->builder;
1893   struct lp_type int_type = lp_int_type(bld->type);
1894   LLVMBuilderRef b = builder;
1895
1896   /*
1897    *  take the absolute value,
1898    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1899    */
1900
1901   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
1902   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
1903
1904   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1905   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
1906
1907   /*
1908    * scale by 4/Pi
1909    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1910    */
1911
1912   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
1913   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1914
1915   /*
1916    * store the integer part of y in mm0
1917    * emm2 = _mm_cvttps_epi32(y);
1918    */
1919
1920   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
1921
1922   /*
1923    * j=(j+1) & (~1) (see the cephes sources)
1924    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1925    */
1926
1927   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
1928   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1929   /*
1930    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1931    */
1932   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
1933   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1934
1935   /*
1936    * y = _mm_cvtepi32_ps(emm2);
1937    */
1938   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
1939
1940
1941   /*
1942    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1943    */
1944   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1945   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1946
1947
1948   /* get the swap sign flag
1949    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1950    */
1951   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
1952   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1953   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
1954   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1955
1956   /*
1957    * emm2 = _mm_slli_epi32(emm0, 29);
1958    */
1959   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
1960   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1961
1962   /*
1963    * get the polynom selection mask
1964    * there is one polynom for 0 <= x <= Pi/4
1965    * and another one for Pi/4<x<=Pi/2
1966    * Both branches will be computed.
1967    *
1968    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1969    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1970    */
1971
1972   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1973   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1974   LLVMValueRef poly_mask = lp_build_compare(gallivm,
1975                                             int_type, PIPE_FUNC_EQUAL,
1976   				             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
1977
1978   /*
1979    * _PS_CONST(minus_cephes_DP1, -0.78515625);
1980    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1981    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1982    */
1983   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
1984   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
1985   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
1986
1987   /*
1988    * The magic pass: "Extended precision modular arithmetic"
1989    * x = ((x - y * DP1) - y * DP2) - y * DP3;
1990    * xmm1 = _mm_mul_ps(y, xmm1);
1991    * xmm2 = _mm_mul_ps(y, xmm2);
1992    * xmm3 = _mm_mul_ps(y, xmm3);
1993    */
1994   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1995   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1996   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1997
1998   /*
1999    * x = _mm_add_ps(x, xmm1);
2000    * x = _mm_add_ps(x, xmm2);
2001    * x = _mm_add_ps(x, xmm3);
2002    */
2003
2004   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2005   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2006   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2007
2008   /*
2009    * Evaluate the first polynom  (0 <= x <= Pi/4)
2010    *
2011    * z = _mm_mul_ps(x,x);
2012    */
2013   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2014
2015   /*
2016    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2017    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2018    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2019    */
2020   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2021   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2022   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2023
2024   /*
2025    * y = *(v4sf*)_ps_coscof_p0;
2026    * y = _mm_mul_ps(y, z);
2027    */
2028   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2029   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2030   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2031   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2032   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2033   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2034
2035
2036   /*
2037    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2038    * y = _mm_sub_ps(y, tmp);
2039    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2040    */
2041   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2042   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2043   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2044   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2045   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2046
2047   /*
2048    * _PS_CONST(sincof_p0, -1.9515295891E-4);
2049    * _PS_CONST(sincof_p1,  8.3321608736E-3);
2050    * _PS_CONST(sincof_p2, -1.6666654611E-1);
2051    */
2052   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2053   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2054   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2055
2056   /*
2057    * Evaluate the second polynom  (Pi/4 <= x <= 0)
2058    *
2059    * y2 = *(v4sf*)_ps_sincof_p0;
2060    * y2 = _mm_mul_ps(y2, z);
2061    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2062    * y2 = _mm_mul_ps(y2, z);
2063    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2064    * y2 = _mm_mul_ps(y2, z);
2065    * y2 = _mm_mul_ps(y2, x);
2066    * y2 = _mm_add_ps(y2, x);
2067    */
2068
2069   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2070   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2071   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2072   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2073   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2074   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2075   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2076
2077   /*
2078    * select the correct result from the two polynoms
2079    * xmm3 = poly_mask;
2080    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2081    * y = _mm_andnot_ps(xmm3, y);
2082    * y = _mm_add_ps(y,y2);
2083    */
2084   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2085   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2086   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2087   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2088   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2089   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2090
2091   /*
2092    * update the sign
2093    * y = _mm_xor_ps(y, sign_bit);
2094    */
2095   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2096   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2097   return y_result;
2098}
2099
2100
2101/**
2102 * Generate pow(x, y)
2103 */
2104LLVMValueRef
2105lp_build_pow(struct lp_build_context *bld,
2106             LLVMValueRef x,
2107             LLVMValueRef y)
2108{
2109   /* TODO: optimize the constant case */
2110   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2111       LLVMIsConstant(x) && LLVMIsConstant(y)) {
2112      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2113                   __FUNCTION__);
2114   }
2115
2116   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2117}
2118
2119
2120/**
2121 * Generate exp(x)
2122 */
2123LLVMValueRef
2124lp_build_exp(struct lp_build_context *bld,
2125             LLVMValueRef x)
2126{
2127   /* log2(e) = 1/log(2) */
2128   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2129                                           1.4426950408889634);
2130
2131   assert(lp_check_value(bld->type, x));
2132
2133   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2134}
2135
2136
2137/**
2138 * Generate log(x)
2139 */
2140LLVMValueRef
2141lp_build_log(struct lp_build_context *bld,
2142             LLVMValueRef x)
2143{
2144   /* log(2) */
2145   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2146                                          0.69314718055994529);
2147
2148   assert(lp_check_value(bld->type, x));
2149
2150   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2151}
2152
2153
2154/**
2155 * Generate polynomial.
2156 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2157 */
2158static LLVMValueRef
2159lp_build_polynomial(struct lp_build_context *bld,
2160                    LLVMValueRef x,
2161                    const double *coeffs,
2162                    unsigned num_coeffs)
2163{
2164   const struct lp_type type = bld->type;
2165   LLVMValueRef res = NULL;
2166   unsigned i;
2167
2168   assert(lp_check_value(bld->type, x));
2169
2170   /* TODO: optimize the constant case */
2171   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2172       LLVMIsConstant(x)) {
2173      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2174                   __FUNCTION__);
2175   }
2176
2177   for (i = num_coeffs; i--; ) {
2178      LLVMValueRef coeff;
2179
2180      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2181
2182      if(res)
2183         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
2184      else
2185         res = coeff;
2186   }
2187
2188   if(res)
2189      return res;
2190   else
2191      return bld->undef;
2192}
2193
2194
2195/**
2196 * Minimax polynomial fit of 2**x, in range [0, 1[
2197 */
2198const double lp_build_exp2_polynomial[] = {
2199#if EXP_POLY_DEGREE == 5
2200   0.999999925063526176901,
2201   0.693153073200168932794,
2202   0.240153617044375388211,
2203   0.0558263180532956664775,
2204   0.00898934009049466391101,
2205   0.00187757667519147912699
2206#elif EXP_POLY_DEGREE == 4
2207   1.00000259337069434683,
2208   0.693003834469974940458,
2209   0.24144275689150793076,
2210   0.0520114606103070150235,
2211   0.0135341679161270268764
2212#elif EXP_POLY_DEGREE == 3
2213   0.999925218562710312959,
2214   0.695833540494823811697,
2215   0.226067155427249155588,
2216   0.0780245226406372992967
2217#elif EXP_POLY_DEGREE == 2
2218   1.00172476321474503578,
2219   0.657636275736077639316,
2220   0.33718943461968720704
2221#else
2222#error
2223#endif
2224};
2225
2226
2227void
2228lp_build_exp2_approx(struct lp_build_context *bld,
2229                     LLVMValueRef x,
2230                     LLVMValueRef *p_exp2_int_part,
2231                     LLVMValueRef *p_frac_part,
2232                     LLVMValueRef *p_exp2)
2233{
2234   LLVMBuilderRef builder = bld->gallivm->builder;
2235   const struct lp_type type = bld->type;
2236   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2237   LLVMValueRef ipart = NULL;
2238   LLVMValueRef fpart = NULL;
2239   LLVMValueRef expipart = NULL;
2240   LLVMValueRef expfpart = NULL;
2241   LLVMValueRef res = NULL;
2242
2243   assert(lp_check_value(bld->type, x));
2244
2245   if(p_exp2_int_part || p_frac_part || p_exp2) {
2246      /* TODO: optimize the constant case */
2247      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2248          LLVMIsConstant(x)) {
2249         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2250                      __FUNCTION__);
2251      }
2252
2253      assert(type.floating && type.width == 32);
2254
2255      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
2256      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
2257
2258      /* ipart = floor(x) */
2259      /* fpart = x - ipart */
2260      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2261   }
2262
2263   if(p_exp2_int_part || p_exp2) {
2264      /* expipart = (float) (1 << ipart) */
2265      expipart = LLVMBuildAdd(builder, ipart,
2266                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
2267      expipart = LLVMBuildShl(builder, expipart,
2268                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
2269      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2270   }
2271
2272   if(p_exp2) {
2273      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2274                                     Elements(lp_build_exp2_polynomial));
2275
2276      res = LLVMBuildFMul(builder, expipart, expfpart, "");
2277   }
2278
2279   if(p_exp2_int_part)
2280      *p_exp2_int_part = expipart;
2281
2282   if(p_frac_part)
2283      *p_frac_part = fpart;
2284
2285   if(p_exp2)
2286      *p_exp2 = res;
2287}
2288
2289
2290LLVMValueRef
2291lp_build_exp2(struct lp_build_context *bld,
2292              LLVMValueRef x)
2293{
2294   LLVMValueRef res;
2295   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2296   return res;
2297}
2298
2299
2300/**
2301 * Extract the exponent of a IEEE-754 floating point value.
2302 *
2303 * Optionally apply an integer bias.
2304 *
2305 * Result is an integer value with
2306 *
2307 *   ifloor(log2(x)) + bias
2308 */
2309LLVMValueRef
2310lp_build_extract_exponent(struct lp_build_context *bld,
2311                          LLVMValueRef x,
2312                          int bias)
2313{
2314   LLVMBuilderRef builder = bld->gallivm->builder;
2315   const struct lp_type type = bld->type;
2316   unsigned mantissa = lp_mantissa(type);
2317   LLVMValueRef res;
2318
2319   assert(type.floating);
2320
2321   assert(lp_check_value(bld->type, x));
2322
2323   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2324
2325   res = LLVMBuildLShr(builder, x,
2326                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2327   res = LLVMBuildAnd(builder, res,
2328                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
2329   res = LLVMBuildSub(builder, res,
2330                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2331
2332   return res;
2333}
2334
2335
2336/**
2337 * Extract the mantissa of the a floating.
2338 *
2339 * Result is a floating point value with
2340 *
2341 *   x / floor(log2(x))
2342 */
2343LLVMValueRef
2344lp_build_extract_mantissa(struct lp_build_context *bld,
2345                          LLVMValueRef x)
2346{
2347   LLVMBuilderRef builder = bld->gallivm->builder;
2348   const struct lp_type type = bld->type;
2349   unsigned mantissa = lp_mantissa(type);
2350   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2351                                                  (1ULL << mantissa) - 1);
2352   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2353   LLVMValueRef res;
2354
2355   assert(lp_check_value(bld->type, x));
2356
2357   assert(type.floating);
2358
2359   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2360
2361   /* res = x / 2**ipart */
2362   res = LLVMBuildAnd(builder, x, mantmask, "");
2363   res = LLVMBuildOr(builder, res, one, "");
2364   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2365
2366   return res;
2367}
2368
2369
2370
2371/**
2372 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2373 * These coefficients can be generate with
2374 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2375 */
2376const double lp_build_log2_polynomial[] = {
2377#if LOG_POLY_DEGREE == 6
2378   3.11578814719469302614,
2379   -3.32419399085241980044,
2380   2.59883907202499966007,
2381   -1.23152682416275988241,
2382   0.318212422185251071475,
2383   -0.0344359067839062357313
2384#elif LOG_POLY_DEGREE == 5
2385   2.8882704548164776201,
2386   -2.52074962577807006663,
2387   1.48116647521213171641,
2388   -0.465725644288844778798,
2389   0.0596515482674574969533
2390#elif LOG_POLY_DEGREE == 4
2391   2.61761038894603480148,
2392   -1.75647175389045657003,
2393   0.688243882994381274313,
2394   -0.107254423828329604454
2395#elif LOG_POLY_DEGREE == 3
2396   2.28330284476918490682,
2397   -1.04913055217340124191,
2398   0.204446009836232697516
2399#else
2400#error
2401#endif
2402};
2403
2404
2405/**
2406 * See http://www.devmaster.net/forums/showthread.php?p=43580
2407 */
2408void
2409lp_build_log2_approx(struct lp_build_context *bld,
2410                     LLVMValueRef x,
2411                     LLVMValueRef *p_exp,
2412                     LLVMValueRef *p_floor_log2,
2413                     LLVMValueRef *p_log2)
2414{
2415   LLVMBuilderRef builder = bld->gallivm->builder;
2416   const struct lp_type type = bld->type;
2417   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2418   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2419
2420   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2421   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2422   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2423
2424   LLVMValueRef i = NULL;
2425   LLVMValueRef exp = NULL;
2426   LLVMValueRef mant = NULL;
2427   LLVMValueRef logexp = NULL;
2428   LLVMValueRef logmant = NULL;
2429   LLVMValueRef res = NULL;
2430
2431   assert(lp_check_value(bld->type, x));
2432
2433   if(p_exp || p_floor_log2 || p_log2) {
2434      /* TODO: optimize the constant case */
2435      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2436          LLVMIsConstant(x)) {
2437         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2438                      __FUNCTION__);
2439      }
2440
2441      assert(type.floating && type.width == 32);
2442
2443      /*
2444       * We don't explicitly handle denormalized numbers. They will yield a
2445       * result in the neighbourhood of -127, which appears to be adequate
2446       * enough.
2447       */
2448
2449      i = LLVMBuildBitCast(builder, x, int_vec_type, "");
2450
2451      /* exp = (float) exponent(x) */
2452      exp = LLVMBuildAnd(builder, i, expmask, "");
2453   }
2454
2455   if(p_floor_log2 || p_log2) {
2456      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
2457      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
2458      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
2459   }
2460
2461   if(p_log2) {
2462      /* mant = (float) mantissa(x) */
2463      mant = LLVMBuildAnd(builder, i, mantmask, "");
2464      mant = LLVMBuildOr(builder, mant, one, "");
2465      mant = LLVMBuildBitCast(builder, mant, vec_type, "");
2466
2467      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2468                                    Elements(lp_build_log2_polynomial));
2469
2470      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
2471      logmant = LLVMBuildFMul(builder, logmant, LLVMBuildFSub(builder, mant, bld->one, ""), "");
2472
2473      res = LLVMBuildFAdd(builder, logmant, logexp, "");
2474   }
2475
2476   if(p_exp) {
2477      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
2478      *p_exp = exp;
2479   }
2480
2481   if(p_floor_log2)
2482      *p_floor_log2 = logexp;
2483
2484   if(p_log2)
2485      *p_log2 = res;
2486}
2487
2488
2489LLVMValueRef
2490lp_build_log2(struct lp_build_context *bld,
2491              LLVMValueRef x)
2492{
2493   LLVMValueRef res;
2494   lp_build_log2_approx(bld, x, NULL, NULL, &res);
2495   return res;
2496}
2497
2498
2499/**
2500 * Faster (and less accurate) log2.
2501 *
2502 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
2503 *
2504 * Piece-wise linear approximation, with exact results when x is a
2505 * power of two.
2506 *
2507 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2508 */
2509LLVMValueRef
2510lp_build_fast_log2(struct lp_build_context *bld,
2511                   LLVMValueRef x)
2512{
2513   LLVMBuilderRef builder = bld->gallivm->builder;
2514   LLVMValueRef ipart;
2515   LLVMValueRef fpart;
2516
2517   assert(lp_check_value(bld->type, x));
2518
2519   assert(bld->type.floating);
2520
2521   /* ipart = floor(log2(x)) - 1 */
2522   ipart = lp_build_extract_exponent(bld, x, -1);
2523   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
2524
2525   /* fpart = x / 2**ipart */
2526   fpart = lp_build_extract_mantissa(bld, x);
2527
2528   /* ipart + fpart */
2529   return LLVMBuildFAdd(builder, ipart, fpart, "");
2530}
2531
2532
2533/**
2534 * Fast implementation of iround(log2(x)).
2535 *
2536 * Not an approximation -- it should give accurate results all the time.
2537 */
2538LLVMValueRef
2539lp_build_ilog2(struct lp_build_context *bld,
2540               LLVMValueRef x)
2541{
2542   LLVMBuilderRef builder = bld->gallivm->builder;
2543   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
2544   LLVMValueRef ipart;
2545
2546   assert(bld->type.floating);
2547
2548   assert(lp_check_value(bld->type, x));
2549
2550   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
2551   x = LLVMBuildFMul(builder, x, sqrt2, "");
2552
2553   /* ipart = floor(log2(x) + 0.5)  */
2554   ipart = lp_build_extract_exponent(bld, x, 0);
2555
2556   return ipart;
2557}
2558