1f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/************************************************************************** 2f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 3f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Copyright 2009-2010 VMware, Inc. 4f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * All Rights Reserved. 5f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 6f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Permission is hereby granted, free of charge, to any person obtaining a 7f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * copy of this software and associated documentation files (the 8f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * "Software"), to deal in the Software without restriction, including 9f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * without limitation the rights to use, copy, modify, merge, publish, 10f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * distribute, sub license, and/or sell copies of the Software, and to 11f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * permit persons to whom the Software is furnished to do so, subject to 12f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * the following conditions: 13f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 14f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * The above copyright notice and this permission notice (including the 15f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * next paragraph) shall be included in all copies or substantial portions 16f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * of the Software. 
17f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 18f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 26f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org **************************************************************************/ 27f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 28f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 29f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 30f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * @file 31f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Helper 32f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 33f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * LLVM IR doesn't support all basic arithmetic operations we care about (most 34f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * notably min/max and saturated operations), and it is often necessary to 35f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * resort to machine-specific intrinsics directly. 
The functions here hide all 36f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * these implementation details from the other modules. 37f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 38f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * We also do simple expressions simplification here. Reasons are: 39f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - it is very easy given we have all necessary information readily available 40f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - LLVM optimization passes fail to simplify several vector expressions 41f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - We often know value constraints which the optimization passes have no way 42f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * of knowing, such as when source arguments are known to be in [0, 1] range. 43f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 44f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * @author Jose Fonseca <jfonseca@vmware.com> 45f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 46f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 47f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 48f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "util/u_memory.h" 49f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "util/u_debug.h" 50f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "util/u_math.h" 51f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "util/u_string.h" 52f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "util/u_cpu_detect.h" 53f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 54f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_bld_type.h" 55f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_bld_const.h" 56f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_bld_init.h" 
57f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_bld_intr.h" 58f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_bld_logic.h" 59f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_bld_pack.h" 60f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_bld_debug.h" 61f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_bld_arit.h" 62f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 63f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 64f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#define EXP_POLY_DEGREE 5 65f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 66f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#define LOG_POLY_DEGREE 4 67f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 68f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 69f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 70f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate min(a, b) 71f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * No checks for special case values of a or b = 1 or 0 are done. 
72f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 73f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic LLVMValueRef 74f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_min_simple(struct lp_build_context *bld, 75f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 76f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef b) 77f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 78f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 79f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const char *intrinsic = NULL; 80f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned intr_size = 0; 81f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef cond; 82f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 83f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 84f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, b)); 85f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 86f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* TODO: optimize the constant case */ 87f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 88f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating && util_cpu_caps.has_sse) { 89f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 32) { 90f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.length == 1) { 91f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse.min.ss"; 92f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 128; 93f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 94f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (type.length <= 4 || !util_cpu_caps.has_avx) { 95f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse.min.ps"; 
96f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 128; 97f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 98f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 99f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.avx.min.ps.256"; 100f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 256; 101f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 102f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 103f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 64 && util_cpu_caps.has_sse2) { 104f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.length == 1) { 105f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse2.min.sd"; 106f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 128; 107f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 108f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (type.length == 2 || !util_cpu_caps.has_avx) { 109f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse2.min.pd"; 110f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 128; 111f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 112f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 113f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.avx.min.pd.256"; 114f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 256; 115f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 116f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 117f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 118f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (util_cpu_caps.has_sse2 && type.length >= 2) { 119f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 128; 120f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if ((type.width == 8 || type.width == 
16) && 121f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (type.width * type.length <= 64) && 122f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (gallivm_debug & GALLIVM_DEBUG_PERF)) { 123f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org debug_printf("%s: inefficient code, bogus shuffle due to packing\n", 124f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __FUNCTION__); 125f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 126f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 8 && !type.sign) { 127f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse2.pminu.b"; 128f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 129f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (type.width == 16 && type.sign) { 130f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse2.pmins.w"; 131f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 132f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (util_cpu_caps.has_sse4_1) { 133f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 8 && type.sign) { 134f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.pminsb"; 135f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 136f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 16 && !type.sign) { 137f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.pminuw"; 138f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 139f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 32 && !type.sign) { 140f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.pminud"; 141f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 142f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 32 && type.sign) { 
143f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.pminsd"; 144f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 145f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 146f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 147f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 148f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(intrinsic) { 149f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 150f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org type, 151f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size, a, b); 152f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 153f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 154f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 155f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_select(bld, cond, a, b); 156f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 157f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 158f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 159f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 160f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate max(a, b) 161f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * No checks for special case values of a or b = 1 or 0 are done. 
162f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 163f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic LLVMValueRef 164f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_max_simple(struct lp_build_context *bld, 165f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 166f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef b) 167f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 168f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 169f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const char *intrinsic = NULL; 170f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned intr_size = 0; 171f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef cond; 172f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 173f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 174f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, b)); 175f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 176f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* TODO: optimize the constant case */ 177f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 178f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating && util_cpu_caps.has_sse) { 179f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 32) { 180f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.length == 1) { 181f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse.max.ss"; 182f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 128; 183f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 184f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (type.length <= 4 || !util_cpu_caps.has_avx) { 185f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse.max.ps"; 
186f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 128; 187f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 188f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 189f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.avx.max.ps.256"; 190f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 256; 191f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 192f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 193f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 64 && util_cpu_caps.has_sse2) { 194f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.length == 1) { 195f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse2.max.sd"; 196f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 128; 197f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 198f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (type.length == 2 || !util_cpu_caps.has_avx) { 199f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse2.max.pd"; 200f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 128; 201f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 202f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 203f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.avx.max.pd.256"; 204f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 256; 205f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 206f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 207f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 208f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (util_cpu_caps.has_sse2 && type.length >= 2) { 209f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 128; 210f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if ((type.width == 8 || 
type.width == 16) && 211f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (type.width * type.length <= 64) && 212f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (gallivm_debug & GALLIVM_DEBUG_PERF)) { 213f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org debug_printf("%s: inefficient code, bogus shuffle due to packing\n", 214f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __FUNCTION__); 215f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 216f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 8 && !type.sign) { 217f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse2.pmaxu.b"; 218f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size = 128; 219f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 220f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (type.width == 16 && type.sign) { 221f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse2.pmaxs.w"; 222f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 223f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (util_cpu_caps.has_sse4_1) { 224f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 8 && type.sign) { 225f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.pmaxsb"; 226f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 227f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 16 && !type.sign) { 228f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.pmaxuw"; 229f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 230f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width == 32 && !type.sign) { 231f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.pmaxud"; 232f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 233f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
if (type.width == 32 && type.sign) { 234f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.pmaxsd"; 235f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 236f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 237f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 238f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 239f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(intrinsic) { 240f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 241f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org type, 242f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intr_size, a, b); 243f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 244f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 245f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 246f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_select(bld, cond, a, b); 247f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 248f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 249f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 250f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 251f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate 1 - a, or ~a depending on bld->type. 
252f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 253f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 254f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_comp(struct lp_build_context *bld, 255f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 256f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 257f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 258f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 259f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 260f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 261f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 262f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->one) 263f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->zero; 264f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->zero) 265f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->one; 266f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 267f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(type.norm && !type.floating && !type.fixed && !type.sign) { 268f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(LLVMIsConstant(a)) 269f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMConstNot(a); 270f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 271f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildNot(builder, a, ""); 272f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 273f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 274f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(LLVMIsConstant(a)) 275f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 276f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMConstFSub(bld->one, a); 
277f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 278f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMConstSub(bld->one, a); 279f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 280f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 281f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildFSub(builder, bld->one, a, ""); 282f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 283f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildSub(builder, bld->one, a, ""); 284f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 285f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 286f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 287f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 288f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate a + b 289f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 290f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 291f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_add(struct lp_build_context *bld, 292f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 293f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef b) 294f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 295f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 296f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 297f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 298f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 299f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 300f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, b)); 301f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
302f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->zero) 303f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return b; 304f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == bld->zero) 305f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 306f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->undef || b == bld->undef) 307f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 308f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 309f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(bld->type.norm) { 310f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const char *intrinsic = NULL; 311f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 312f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->one || b == bld->one) 313f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->one; 314f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 315f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(util_cpu_caps.has_sse2 && 316f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org type.width * type.length == 128 && 317f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org !type.floating && !type.fixed) { 318f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(type.width == 8) 319f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; 320f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(type.width == 16) 321f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = type.sign ? 
"llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; 322f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 323f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 324f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(intrinsic) 325f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 326f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 327f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 328f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(LLVMIsConstant(a) && LLVMIsConstant(b)) 329f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 330f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMConstFAdd(a, b); 331f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 332f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMConstAdd(a, b); 333f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 334f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 335f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFAdd(builder, a, b, ""); 336f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 337f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildAdd(builder, a, b, ""); 338f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 339f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* clamp to ceiling of 1.0 */ 340f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 341f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_min_simple(bld, res, bld->one); 342f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 343f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* XXX clamp to floor of -1 or 0??? 
*/ 344f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 345f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 346f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 347f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 348f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 349f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** Return the scalar sum of the elements of a. 350f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Should avoid this operation whenever possible. 351f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 352f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 353f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_horizontal_add(struct lp_build_context *bld, 354f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 355f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 356f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 357f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 358f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef index, res; 359f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned i, length; 360f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2]; 361f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2]; 362f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef vecres, elem2; 363f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 364f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 365f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 366f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.length == 1) { 367f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 
368f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 369f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 370f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(!bld->type.norm); 371f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 372f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 373f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * for byte vectors can do much better with psadbw. 374f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Using repeated shuffle/adds here. Note with multiple vectors 375f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * this can be done more efficiently as outlined in the intel 376f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * optimization manual. 377f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Note: could cause data rearrangement if used with smaller element 378f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * sizes. 379f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 380f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 381f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org vecres = a; 382f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org length = type.length / 2; 383f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org while (length > 1) { 384f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef vec1, vec2; 385f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org for (i = 0; i < length; i++) { 386f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles1[i] = lp_build_const_int32(bld->gallivm, i); 387f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length); 388f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 389f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org vec1 = LLVMBuildShuffleVector(builder, vecres, vecres, 390f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
LLVMConstVector(shuffles1, length), ""); 391f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org vec2 = LLVMBuildShuffleVector(builder, vecres, vecres, 392f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMConstVector(shuffles2, length), ""); 393f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) { 394f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org vecres = LLVMBuildFAdd(builder, vec1, vec2, ""); 395f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 396f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 397f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org vecres = LLVMBuildAdd(builder, vec1, vec2, ""); 398f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 399f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org length = length >> 1; 400f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 401f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 402f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* always have vector of size 2 here */ 403f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(length == 1); 404f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 405f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org index = lp_build_const_int32(bld->gallivm, 0); 406f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildExtractElement(builder, vecres, index, ""); 407f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org index = lp_build_const_int32(bld->gallivm, 1); 408f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org elem2 = LLVMBuildExtractElement(builder, vecres, index, ""); 409f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 410f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 411f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFAdd(builder, res, elem2, ""); 412f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 
413f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildAdd(builder, res, elem2, ""); 414f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 415f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 416f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 417f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 418f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 419f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Return the horizontal sums of 4 float vectors as a float4 vector. 420f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * This uses the technique as outlined in Intel Optimization Manual. 421f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 422f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic LLVMValueRef 423f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_horizontal_add4x4f(struct lp_build_context *bld, 424f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef src[4]) 425f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 426f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct gallivm_state *gallivm = bld->gallivm; 427f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = gallivm->builder; 428f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef shuffles[4]; 429f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef tmp[4]; 430f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sumtmp[2], shuftmp[2]; 431f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 432f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* lower half of regs */ 433f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[0] = lp_build_const_int32(gallivm, 0); 434f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[1] = lp_build_const_int32(gallivm, 1); 435f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[2] = 
lp_build_const_int32(gallivm, 4); 436f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[3] = lp_build_const_int32(gallivm, 5); 437f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1], 438f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMConstVector(shuffles, 4), ""); 439f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3], 440f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMConstVector(shuffles, 4), ""); 441f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 442f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* upper half of regs */ 443f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[0] = lp_build_const_int32(gallivm, 2); 444f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[1] = lp_build_const_int32(gallivm, 3); 445f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[2] = lp_build_const_int32(gallivm, 6); 446f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[3] = lp_build_const_int32(gallivm, 7); 447f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1], 448f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMConstVector(shuffles, 4), ""); 449f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3], 450f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMConstVector(shuffles, 4), ""); 451f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 452f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], ""); 453f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], ""); 454f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 455f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[0] = 
lp_build_const_int32(gallivm, 0); 456f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[1] = lp_build_const_int32(gallivm, 2); 457f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[2] = lp_build_const_int32(gallivm, 4); 458f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[3] = lp_build_const_int32(gallivm, 6); 459f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 460f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMConstVector(shuffles, 4), ""); 461f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 462f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[0] = lp_build_const_int32(gallivm, 1); 463f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[1] = lp_build_const_int32(gallivm, 3); 464f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[2] = lp_build_const_int32(gallivm, 5); 465f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuffles[3] = lp_build_const_int32(gallivm, 7); 466f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 467f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMConstVector(shuffles, 4), ""); 468f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 469f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], ""); 470f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 471f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 472f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 473f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/* 474f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * partially horizontally add 2-4 float vectors with length nx4, 475f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * i.e. 
only four adjacent values in each vector will be added, 476f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * assuming values are really grouped in 4 which also determines 477f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * output order. 478f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 479f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Return a vector of the same length as the initial vectors, 480f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * with the excess elements (if any) being undefined. 481f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * The element order is independent of number of input vectors. 482f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7 483f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * the output order thus will be 484f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef 485f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 486f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 487f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_hadd_partial4(struct lp_build_context *bld, 488f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef vectors[], 489f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned num_vecs) 490f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 491f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct gallivm_state *gallivm = bld->gallivm; 492f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = gallivm->builder; 493f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef ret_vec; 494f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef tmp[4]; 495f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const char *intrinsic = NULL; 
496f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 497f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(num_vecs >= 2 && num_vecs <= 4); 498f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(bld->type.floating); 499f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 500f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* only use this with at least 2 vectors, as it is sort of expensive 501f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * (depending on cpu) and we always need two horizontal adds anyway, 502f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * so a shuffle/add approach might be better. 503f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 504f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 505f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[0] = vectors[0]; 506f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[1] = vectors[1]; 507f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 508f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0]; 509f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[3] = num_vecs > 3 ? 
vectors[3] : vectors[0]; 510f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 511f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (util_cpu_caps.has_sse3 && bld->type.width == 32 && 512f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org bld->type.length == 4) { 513f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse3.hadd.ps"; 514f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 515f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (util_cpu_caps.has_avx && bld->type.width == 32 && 516f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org bld->type.length == 8) { 517f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.avx.hadd.ps.256"; 518f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 519f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (intrinsic) { 520f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[0] = lp_build_intrinsic_binary(builder, intrinsic, 521f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_vec_type(gallivm, bld->type), 522f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[0], tmp[1]); 523f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (num_vecs > 2) { 524f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[1] = lp_build_intrinsic_binary(builder, intrinsic, 525f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_vec_type(gallivm, bld->type), 526f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[2], tmp[3]); 527f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 528f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 529f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[1] = tmp[0]; 530f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 531f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_intrinsic_binary(builder, intrinsic, 532f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
lp_build_vec_type(gallivm, bld->type), 533f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org tmp[0], tmp[1]); 534f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 535f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 536f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (bld->type.length == 4) { 537f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ret_vec = lp_build_horizontal_add4x4f(bld, tmp); 538f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 539f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 540f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4]; 541f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned j; 542f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned num_iter = bld->type.length / 4; 543f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct lp_type parttype = bld->type; 544f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org parttype.length = 4; 545f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org for (j = 0; j < num_iter; j++) { 546f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef partsrc[4]; 547f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned i; 548f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org for (i = 0; i < 4; i++) { 549f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4); 550f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 551f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org partres[j] = lp_build_horizontal_add4x4f(bld, partsrc); 552f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 553f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter); 554f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 555f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return ret_vec; 
556f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 557f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 558f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 559f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate a - b 560f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 561f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 562f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_sub(struct lp_build_context *bld, 563f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 564f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef b) 565f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 566f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 567f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 568f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 569f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 570f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 571f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, b)); 572f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 573f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == bld->zero) 574f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 575f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->undef || b == bld->undef) 576f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 577f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == b) 578f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->zero; 579f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 580f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(bld->type.norm) { 581f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const char 
*intrinsic = NULL; 582f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 583f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == bld->one) 584f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->zero; 585f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 586f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(util_cpu_caps.has_sse2 && 587f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org type.width * type.length == 128 && 588f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org !type.floating && !type.fixed) { 589f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(type.width == 8) 590f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; 591f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(type.width == 16) 592f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; 593f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 594f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 595f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(intrinsic) 596f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 597f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 598f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 599f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(LLVMIsConstant(a) && LLVMIsConstant(b)) 600f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 601f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMConstFSub(a, b); 602f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 603f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMConstSub(a, b); 604f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 
605f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 606f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFSub(builder, a, b, ""); 607f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 608f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildSub(builder, a, b, ""); 609f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 610f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 611f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_max_simple(bld, res, bld->zero); 612f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 613f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 614f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 615f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 616f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 617f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 618f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Normalized 8bit multiplication. 
619f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 620f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - alpha plus one 621f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 622f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * makes the following approximation to the division (Sree) 623f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 624f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * a*b/255 ~= (a*(b + 1)) >> 256 625f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 626f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * which is the fastest method that satisfies the following OpenGL criteria 627f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 628f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 0*0 = 0 and 255*255 = 255 629f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 630f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - geometric series 631f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 632f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * takes the geometric series approximation to the division 633f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 634f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * t/255 = (t >> 8) + (t >> 16) + (t >> 24) .. 
635f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 636f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * in this case just the first two terms to fit in 16bit arithmetic 637f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 638f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * t/255 ~= (t + (t >> 8)) >> 8 639f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 640f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * note that just by itself it doesn't satisfies the OpenGL criteria, as 641f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 255*255 = 254, so the special case b = 255 must be accounted or roundoff 642f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * must be used 643f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 644f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - geometric series plus rounding 645f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 646f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * when using a geometric series division instead of truncating the result 647f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * use roundoff in the approximation (Jim Blinn) 648f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 649f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * t/255 ~= (t + (t >> 8) + 0x80) >> 8 650f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 651f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * achieving the exact results 652f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 653f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 654f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf 655f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * @sa Michael Herf, The "double blend trick", May 2000, 
656f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * http://www.stereopsis.com/doubleblend.html 657f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 658f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic LLVMValueRef 659f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_mul_u8n(struct gallivm_state *gallivm, 660f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct lp_type i16_type, 661f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, LLVMValueRef b) 662f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 663f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = gallivm->builder; 664f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef c8; 665f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef ab; 666f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 667f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(!i16_type.floating); 668f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(i16_type, a)); 669f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(i16_type, b)); 670f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 671f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org c8 = lp_build_const_int_vec(gallivm, i16_type, 8); 672f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 673f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#if 0 674f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 675f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* a*b/255 ~= (a*(b + 1)) >> 256 */ 676f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallium, i16_type, 1), ""); 677f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ab = LLVMBuildMul(builder, a, b, ""); 678f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
679f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#else 680f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 681f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */ 682f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ab = LLVMBuildMul(builder, a, b, ""); 683f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), ""); 684f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), ""); 685f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 686f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#endif 687f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 688f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ab = LLVMBuildLShr(builder, ab, c8, ""); 689f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 690f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return ab; 691f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 692f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 693f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 694f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 695f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate a * b 696f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 697f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 698f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_mul(struct lp_build_context *bld, 699f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 700f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef b) 701f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 702f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 703f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = 
bld->type; 704f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef shift; 705f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 706f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 707f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 708f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, b)); 709f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 710f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->zero) 711f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->zero; 712f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->one) 713f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return b; 714f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == bld->zero) 715f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->zero; 716f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == bld->one) 717f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 718f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->undef || b == bld->undef) 719f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 720f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 721f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(!type.floating && !type.fixed && type.norm) { 722f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(type.width == 8) { 723f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct lp_type i16_type = lp_wider_type(type); 724f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef al, ah, bl, bh, abl, abh, ab; 725f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 726f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah); 727f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_unpack2(bld->gallivm, 
type, i16_type, b, &bl, &bh); 728f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 729f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* PMULLW, PSRLW, PADDW */ 730f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl); 731f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh); 732f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 733f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh); 734f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 735f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return ab; 736f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 737f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 738f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* FIXME */ 739f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(0); 740f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 741f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 742f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(type.fixed) 743f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2); 744f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 745f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shift = NULL; 746f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 747f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 748f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 749f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMConstFMul(a, b); 750f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 751f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMConstMul(a, b); 752f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(shift) 
{ 753f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(type.sign) 754f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMConstAShr(res, shift); 755f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 756f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMConstLShr(res, shift); 757f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 758f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 759f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 760f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 761f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFMul(builder, a, b, ""); 762f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 763f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildMul(builder, a, b, ""); 764f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(shift) { 765f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(type.sign) 766f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildAShr(builder, res, shift, ""); 767f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 768f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildLShr(builder, res, shift, ""); 769f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 770f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 771f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 772f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 773f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 774f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 775f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 776f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 777f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Small vector x scale multiplication optimization. 
778f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 779f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 780f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_mul_imm(struct lp_build_context *bld, 781f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 782f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int b) 783f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 784f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 785f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef factor; 786f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 787f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, a)); 788f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 789f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == 0) 790f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->zero; 791f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 792f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == 1) 793f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 794f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 795f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == -1) 796f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_negate(bld, a); 797f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 798f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == 2 && bld->type.floating) 799f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_add(bld, a, a); 800f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 801f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(util_is_power_of_two(b)) { 802f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned shift = ffs(b) - 1; 803f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
804f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(bld->type.floating) { 805f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#if 0 806f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 807f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Power of two multiplication by directly manipulating the exponent. 808f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 809f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * XXX: This might not be always faster, it will introduce a small error 810f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * for multiplication by zero, and it will produce wrong results 811f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * for Inf and NaN. 812f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 813f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned mantissa = lp_mantissa(bld->type); 814f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa); 815f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), ""); 816f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org a = LLVMBuildAdd(builder, a, factor, ""); 817f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), ""); 818f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 819f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#endif 820f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 821f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 822f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org factor = lp_build_const_vec(bld->gallivm, bld->type, shift); 823f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildShl(builder, a, factor, ""); 824f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 
825f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 826f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 827f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b); 828f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_mul(bld, a, factor); 829f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 830f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 831f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 832f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 833f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate a / b 834f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 835f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 836f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_div(struct lp_build_context *bld, 837f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 838f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef b) 839f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 840f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 841f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 842f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 843f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 844f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, b)); 845f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 846f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->zero) 847f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->zero; 848f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->one) 849f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_rcp(bld, b); 
850f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == bld->zero) 851f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 852f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == bld->one) 853f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 854f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->undef || b == bld->undef) 855f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 856f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 857f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 858f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 859f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMConstFDiv(a, b); 860f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (type.sign) 861f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMConstSDiv(a, b); 862f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 863f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMConstUDiv(a, b); 864f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 865f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 866f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 867f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && 868f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org type.floating) 869f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 870f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 871f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 872f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildFDiv(builder, a, b, ""); 
873f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (type.sign) 874f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildSDiv(builder, a, b, ""); 875f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 876f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildUDiv(builder, a, b, ""); 877f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 878f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 879f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 880f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 881f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Linear interpolation -- without any checks. 882f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 883f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * @sa http://www.stereopsis.com/doubleblend.html 884f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 885f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic INLINE LLVMValueRef 886f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_lerp_simple(struct lp_build_context *bld, 887f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x, 888f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef v0, 889f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef v1) 890f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 891f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 892f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef delta; 893f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 894f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 895f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, x)); 896f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, v0)); 
897f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, v1)); 898f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 899f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org delta = lp_build_sub(bld, v1, v0); 900f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 901f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_mul(bld, x, delta); 902f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 903f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_add(bld, v0, res); 904f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 905f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (bld->type.fixed) { 906f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* XXX: This step is necessary for lerping 8bit colors stored on 16bits, 907f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * but it will be wrong for other uses. Basically we need a more 908f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * powerful lp_type, capable of further distinguishing the values 909f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * interpretation from the value storage. */ 910f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), ""); 911f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 912f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 913f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 914f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 915f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 916f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 917f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 918f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Linear interpolation. 
919f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 920f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 921f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_lerp(struct lp_build_context *bld, 922f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x, 923f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef v0, 924f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef v1) 925f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 926f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 927f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 928f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 929f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 930f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, x)); 931f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, v0)); 932f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, v1)); 933f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 934f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.norm) { 935f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct lp_type wide_type; 936f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct lp_build_context wide_bld; 937f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; 938f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef shift; 939f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 940f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.length >= 2); 941f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(!type.sign); 942f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
943f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 944f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Create a wider type, enough to hold the intermediate result of the 945f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * multiplication. 946f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 947f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org memset(&wide_type, 0, sizeof wide_type); 948f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org wide_type.fixed = TRUE; 949f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org wide_type.width = type.width*2; 950f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org wide_type.length = type.length/2; 951f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 952f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_context_init(&wide_bld, bld->gallivm, wide_type); 953f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 954f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh); 955f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h); 956f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h); 957f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 958f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 959f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Scale x from [0, 255] to [0, 256] 960f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 961f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 962f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1); 963f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 964f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org xl = lp_build_add(&wide_bld, xl, 
965f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuildAShr(builder, xl, shift, "")); 966f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org xh = lp_build_add(&wide_bld, xh, 967f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuildAShr(builder, xh, shift, "")); 968f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 969f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 970f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Lerp both halves. 971f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 972f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 973f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l); 974f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h); 975f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 976f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh); 977f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } else { 978f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_lerp_simple(bld, x, v0, v1); 979f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 980f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 981f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 982f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 983f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 984f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 985f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 986f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_lerp_2d(struct lp_build_context *bld, 987f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x, 988f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y, 989f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef v00, 
990f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef v01, 991f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef v10, 992f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef v11) 993f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 994f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01); 995f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11); 996f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_lerp(bld, y, v0, v1); 997f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 998f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 999f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1000f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1001f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate min(a, b) 1002f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Do checks for special cases. 
1003f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1004f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1005f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_min(struct lp_build_context *bld, 1006f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 1007f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef b) 1008f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1009f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, a)); 1010f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, b)); 1011f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1012f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->undef || b == bld->undef) 1013f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 1014f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1015f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == b) 1016f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 1017f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1018f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (bld->type.norm) { 1019f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (!bld->type.sign) { 1020f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (a == bld->zero || b == bld->zero) { 1021f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->zero; 1022f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1023f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1024f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->one) 1025f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return b; 1026f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(b == bld->one) 1027f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 1028f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 
1029f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1030f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_min_simple(bld, a, b); 1031f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1032f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1033f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1034f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1035f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate max(a, b) 1036f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Do checks for special cases. 1037f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1038f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1039f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_max(struct lp_build_context *bld, 1040f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 1041f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef b) 1042f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1043f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, a)); 1044f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, b)); 1045f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1046f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->undef || b == bld->undef) 1047f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 1048f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1049f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == b) 1050f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 1051f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1052f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(bld->type.norm) { 1053f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->one || b == bld->one) 1054f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->one; 
1055f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (!bld->type.sign) { 1056f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (a == bld->zero) { 1057f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return b; 1058f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1059f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (b == bld->zero) { 1060f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 1061f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1062f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1063f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1064f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1065f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_max_simple(bld, a, b); 1066f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1067f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1068f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1069f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1070f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate clamp(a, min, max) 1071f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Do checks for special cases. 
1072f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1073f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1074f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_clamp(struct lp_build_context *bld, 1075f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 1076f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef min, 1077f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef max) 1078f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1079f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, a)); 1080f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, min)); 1081f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, max)); 1082f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1083f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org a = lp_build_min(bld, a, max); 1084f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org a = lp_build_max(bld, a, min); 1085f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 1086f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1087f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1088f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1089f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1090f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate abs(a) 1091f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1092f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1093f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_abs(struct lp_build_context *bld, 1094f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1095f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1096f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 
1097f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1098f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1099f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1100f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1101f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1102f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(!type.sign) 1103f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 1104f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1105f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(type.floating) { 1106f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* Mask out the sign bit */ 1107f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1108f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned long long absMask = ~(1ULL << (type.width - 1)); 1109f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask)); 1110f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org a = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1111f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org a = LLVMBuildAnd(builder, a, mask, ""); 1112f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org a = LLVMBuildBitCast(builder, a, vec_type, ""); 1113f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return a; 1114f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1115f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1116f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) { 1117f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org switch(type.width) { 
1118f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org case 8: 1119f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 1120f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org case 16: 1121f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 1122f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org case 32: 1123f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 1124f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1125f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1126f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 && 1127f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (gallivm_debug & GALLIVM_DEBUG_PERF) && 1128f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (type.width == 8 || type.width == 16 || type.width == 32)) { 1129f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org debug_printf("%s: inefficient code, should split vectors manually\n", 1130f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __FUNCTION__); 1131f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1132f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1133f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_max(bld, a, LLVMBuildNeg(builder, a, "")); 1134f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1135f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1136f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1137f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1138f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_negate(struct lp_build_context *bld, 1139f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/**
 * Return -1, 0 or +1 depending on the sign of a.
 *
 * The result for non-zero inputs is built first, depending on the type:
 *  - unsigned types: the constant bld->one;
 *  - signed floating point: the sign bit of 'a' OR'ed into the bit pattern
 *    of bld->one;
 *  - other signed types: select(a > 0, one, minus one).
 * A final select then substitutes bld->zero wherever a == 0.
 */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef nonzero_sgn;
   LLVMValueRef is_zero;

   assert(lp_check_value(type, a));

   if (type.sign && type.floating) {
      const unsigned long long sign_bit =
         (unsigned long long)1 << (type.width - 1);
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      LLVMTypeRef flt_vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef sign_mask =
         lp_build_const_int_vec(bld->gallivm, type, sign_bit);
      LLVMValueRef one_bits = LLVMConstBitCast(bld->one, int_vec_type);
      LLVMValueRef bits;

      /* Isolate the sign bit of a and merge it into the bits of the 1 constant */
      bits = LLVMBuildBitCast(builder, a, int_vec_type, "");
      bits = LLVMBuildAnd(builder, bits, sign_mask, "");
      bits = LLVMBuildOr(builder, bits, one_bits, "");
      nonzero_sgn = LLVMBuildBitCast(builder, bits, flt_vec_type, "");
   }
   else if (type.sign) {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      LLVMValueRef is_positive =
         lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);

      nonzero_sgn = lp_build_select(bld, is_positive, bld->one, minus_one);
   }
   else {
      /* unsigned: if not zero then sign must be positive */
      nonzero_sgn = bld->one;
   }

   /* Zero inputs override whatever was computed above */
   is_zero = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   return lp_build_select(bld, is_zero, bld->zero, nonzero_sgn);
}


1209f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1210f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Set the sign of float vector 'a' according to 'sign'. 1211f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * If sign==0, return abs(a). 1212f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * If sign==1, return -abs(a); 1213f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Other values for sign produce undefined results. 1214f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1215f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1216f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_set_sign(struct lp_build_context *bld, 1217f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, LLVMValueRef sign) 1218f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1219f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1220f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1221f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1222f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1223f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1); 1224f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1225f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ~((unsigned long long) 1 << (type.width - 1))); 1226f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef val, res; 1227f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1228f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 
1229f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1230f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1231f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* val = reinterpret_cast<int>(a) */ 1232f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org val = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1233f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* val = val & mask */ 1234f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org val = LLVMBuildAnd(builder, val, mask, ""); 1235f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* sign = sign << shift */ 1236f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sign = LLVMBuildShl(builder, sign, shift, ""); 1237f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* res = val | sign */ 1238f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildOr(builder, val, sign, ""); 1239f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* res = reinterpret_cast<float>(res) */ 1240f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildBitCast(builder, res, vec_type, ""); 1241f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1242f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1243f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1244f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1245f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1246f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1247f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Convert vector of (or scalar) int to vector of (or scalar) float. 
1248f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1249f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1250f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_int_to_float(struct lp_build_context *bld, 1251f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1252f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1253f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1254f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1255f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1256f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1257f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1258f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1259f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildSIToFP(builder, a, vec_type, ""); 1260f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1261f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1262f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic boolean 1263f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgsse41_rounding_available(const struct lp_type type) 1264f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1265f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if ((util_cpu_caps.has_sse4_1 && 1266f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (type.length == 1 || type.width*type.length == 128)) || 1267f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (util_cpu_caps.has_avx && type.width*type.length == 256)) 1268f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return TRUE; 1269f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1270f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return FALSE; 
1271f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1272f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1273f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgenum lp_build_round_sse41_mode 1274f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1275f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LP_BUILD_ROUND_SSE41_NEAREST = 0, 1276f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LP_BUILD_ROUND_SSE41_FLOOR = 1, 1277f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LP_BUILD_ROUND_SSE41_CEIL = 2, 1278f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LP_BUILD_ROUND_SSE41_TRUNCATE = 3 1279f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org}; 1280f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1281f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1282f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1283f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Helper for SSE4.1's ROUNDxx instructions. 1284f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 1285f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the 1286f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0. 
1287f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1288f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic INLINE LLVMValueRef 1289f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_round_sse41(struct lp_build_context *bld, 1290f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 1291f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org enum lp_build_round_sse41_mode mode) 1292f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1293f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1294f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1295f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1296f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const char *intrinsic; 1297f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1298f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1299f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1300f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1301f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1302f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(util_cpu_caps.has_sse4_1); 1303f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1304f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.length == 1) { 1305f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type; 1306f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef undef; 1307f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef args[3]; 1308f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1309f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
1310f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org switch(type.width) { 1311f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org case 32: 1312f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.round.ss"; 1313f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org break; 1314f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org case 64: 1315f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.round.sd"; 1316f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org break; 1317f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org default: 1318f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(0); 1319f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 1320f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1321f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1322f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org vec_type = LLVMVectorType(bld->elem_type, 4); 1323f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1324f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org undef = LLVMGetUndef(vec_type); 1325f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1326f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org args[0] = undef; 1327f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org args[1] = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1328f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org args[2] = LLVMConstInt(i32t, mode, 0); 1329f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1330f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_intrinsic(builder, intrinsic, 1331f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org vec_type, args, Elements(args)); 1332f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1333f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildExtractElement(builder, res, index0, ""); 
1334f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1335f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1336f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width * type.length == 128) { 1337f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org switch(type.width) { 1338f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org case 32: 1339f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.round.ps"; 1340f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org break; 1341f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org case 64: 1342f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse41.round.pd"; 1343f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org break; 1344f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org default: 1345f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(0); 1346f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 1347f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1348f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1349f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1350f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.width * type.length == 256); 1351f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(util_cpu_caps.has_avx); 1352f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1353f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org switch(type.width) { 1354f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org case 32: 1355f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.avx.round.ps.256"; 1356f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org break; 1357f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org case 64: 1358f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.avx.round.pd.256"; 
1359f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org break; 1360f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org default: 1361f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(0); 1362f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 1363f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1364f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1365f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1366f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_intrinsic_binary(builder, intrinsic, 1367f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org bld->vec_type, a, 1368f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMConstInt(i32t, mode, 0)); 1369f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1370f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1371f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1372f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1373f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1374f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1375f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic INLINE LLVMValueRef 1376f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_iround_nearest_sse2(struct lp_build_context *bld, 1377f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1378f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1379f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1380f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1381f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1382f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type); 
1383f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const char *intrinsic; 1384f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1385f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1386f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1387f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* using the double precision conversions is a bit more complicated */ 1388f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.width == 32); 1389f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1390f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1391f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(util_cpu_caps.has_sse2); 1392f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1393f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* This is relying on MXCSR rounding mode, which should always be nearest. */ 1394f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.length == 1) { 1395f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type; 1396f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef undef; 1397f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef arg; 1398f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1399f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1400f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org vec_type = LLVMVectorType(bld->elem_type, 4); 1401f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1402f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse.cvtss2si"; 1403f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1404f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org undef = LLVMGetUndef(vec_type); 1405f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
1406f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org arg = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1407f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1408f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_intrinsic_unary(builder, intrinsic, 1409f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ret_type, arg); 1410f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1411f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1412f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.width* type.length == 128) { 1413f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse2.cvtps2dq"; 1414f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1415f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1416f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.width*type.length == 256); 1417f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(util_cpu_caps.has_avx); 1418f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1419f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; 1420f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1421f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_intrinsic_unary(builder, intrinsic, 1422f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ret_type, a); 1423f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1424f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1425f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1426f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1427f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1428f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1429f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1430f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Return the integer part of a float (vector) value (== 
round toward zero). 1431f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * The returned value is a float (vector). 1432f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: trunc(-1.5) = -1.0 1433f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1434f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1435f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_trunc(struct lp_build_context *bld, 1436f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1437f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1438f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1439f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1440f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1441f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1442f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1443f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1444f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (sse41_rounding_available(type)) { 1445f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); 1446f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1447f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1448f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1449f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1450f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1451f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 1452f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildSIToFP(builder, 
res, vec_type, ""); 1453f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1454f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1455f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1456f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1457f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1458f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1459f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Return float (vector) rounded to nearest integer (vector). The returned 1460f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * value is a float (vector). 1461f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: round(0.9) = 1.0 1462f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: round(-1.5) = -2.0 1463f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1464f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1465f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_round(struct lp_build_context *bld, 1466f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1467f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1468f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1469f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1470f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1471f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1472f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1473f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1474f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (sse41_rounding_available(type)) { 1475f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 1476f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 
1477f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1478f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1479f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1480f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_iround(bld, a); 1481f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1482f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1483f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1484f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1485f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1486f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1487f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1488f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Return floor of float (vector), result is a float (vector) 1489f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: floor(1.1) = 1.0 1490f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: floor(-1.1) = -2.0 1491f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1492f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1493f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_floor(struct lp_build_context *bld, 1494f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1495f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1496f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1497f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1498f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1499f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1500f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 
1501f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1502f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (sse41_rounding_available(type)) { 1503f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 1504f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1505f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1506f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1507f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1508f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_ifloor(bld, a); 1509f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1510f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1511f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1512f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1513f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1514f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1515f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1516f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Return ceiling of float (vector), returning float (vector). 
1517f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: ceil( 1.1) = 2.0 1518f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: ceil(-1.1) = -1.0 1519f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1520f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1521f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_ceil(struct lp_build_context *bld, 1522f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1523f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1524f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1525f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1526f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1527f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1528f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1529f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1530f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (sse41_rounding_available(type)) { 1531f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 1532f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1533f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1534f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1535f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1536f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_iceil(bld, a); 1537f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1538f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1539f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 
1540f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1541f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1542f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1543f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1544f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Return fractional part of 'a' computed as a - floor(a) 1545f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Typically used in texture coord arithmetic. 1546f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1547f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1548f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_fract(struct lp_build_context *bld, 1549f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1550f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1551f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(bld->type.floating); 1552f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_sub(bld, a, lp_build_floor(bld, a)); 1553f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1554f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1555f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1556f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1557f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Prevent returning a fractional part of 1.0 for very small negative values of 1558f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 'a' by clamping against 0.99999(9). 
1559f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1560f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic inline LLVMValueRef 1561f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgclamp_fract(struct lp_build_context *bld, LLVMValueRef fract) 1562f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1563f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef max; 1564f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1565f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* this is the largest number smaller than 1.0 representable as float */ 1566f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org max = lp_build_const_vec(bld->gallivm, bld->type, 1567f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1))); 1568f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_min(bld, fract, max); 1569f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1570f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1571f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1572f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1573f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Same as lp_build_fract, but guarantees that the result is always smaller 1574f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * than one. 
1575f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1576f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1577f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_fract_safe(struct lp_build_context *bld, 1578f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1579f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1580f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return clamp_fract(bld, lp_build_fract(bld, a)); 1581f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1582f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1583f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1584f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1585f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Return the integer part of a float (vector) value (== round toward zero). 1586f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * The returned value is an integer (vector). 1587f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: itrunc(-1.5) = -1 1588f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1589f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1590f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_itrunc(struct lp_build_context *bld, 1591f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1592f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1593f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1594f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1595f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1596f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1597f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1598f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
assert(lp_check_value(type, a)); 1599f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1600f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildFPToSI(builder, a, int_vec_type, ""); 1601f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1602f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1603f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1604f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1605f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Return float (vector) rounded to nearest integer (vector). The returned 1606f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * value is an integer (vector). 1607f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: iround(0.9) = 1 1608f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: iround(-1.5) = -2 1609f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1610f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1611f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_iround(struct lp_build_context *bld, 1612f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1613f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1614f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1615f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1616f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef int_vec_type = bld->int_vec_type; 1617f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1618f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1619f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1620f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1621f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1622f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
1623f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if ((util_cpu_caps.has_sse2 && 1624f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ((type.width == 32) && (type.length == 1 || type.length == 4))) || 1625f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { 1626f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_iround_nearest_sse2(bld, a); 1627f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1628f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (sse41_rounding_available(type)) { 1629f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 1630f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1631f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1632f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef half; 1633f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1634f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org half = lp_build_const_vec(bld->gallivm, type, 0.5); 1635f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1636f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.sign) { 1637f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = bld->vec_type; 1638f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1639f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (unsigned long long)1 << (type.width - 1)); 1640f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sign; 1641f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1642f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* get sign bit */ 1643f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1644f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sign = 
LLVMBuildAnd(builder, sign, mask, ""); 1645f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1646f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* sign * 0.5 */ 1647f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org half = LLVMBuildBitCast(builder, half, int_vec_type, ""); 1648f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org half = LLVMBuildOr(builder, sign, half, ""); 1649f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org half = LLVMBuildBitCast(builder, half, vec_type, ""); 1650f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1651f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1652f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFAdd(builder, a, half, ""); 1653f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1654f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1655f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); 1656f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1657f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1658f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1659f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1660f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1661f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1662f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Return floor of float (vector), result is an int (vector) 1663f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: ifloor(1.1) = 1.0 1664f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: ifloor(-1.1) = -2.0 1665f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1666f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1667f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_ifloor(struct lp_build_context *bld, 1668f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 
1669f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1670f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1671f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1672f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef int_vec_type = bld->int_vec_type; 1673f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1674f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1675f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1676f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1677f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1678f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = a; 1679f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.sign) { 1680f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (sse41_rounding_available(type)) { 1681f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 1682f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1683f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1684f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* Take the sign bit and add it to 1 constant */ 1685f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = bld->vec_type; 1686f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned mantissa = lp_mantissa(type); 1687f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1688f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (unsigned long long)1 << (type.width - 1)); 1689f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sign; 1690f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef offset; 
1691f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1692f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* sign = a < 0 ? ~0 : 0 */ 1693f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1694f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sign = LLVMBuildAnd(builder, sign, mask, ""); 1695f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sign = LLVMBuildAShr(builder, sign, 1696f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_const_int_vec(bld->gallivm, type, 1697f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org type.width - 1), 1698f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org "ifloor.sign"); 1699f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1700f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* offset = -0.99999(9)f */ 1701f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org offset = lp_build_const_vec(bld->gallivm, type, 1702f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); 1703f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org offset = LLVMConstBitCast(offset, int_vec_type); 1704f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1705f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* offset = a < 0 ? 
offset : 0.0f */ 1706f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org offset = LLVMBuildAnd(builder, offset, sign, ""); 1707f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset"); 1708f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1709f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFAdd(builder, res, offset, "ifloor.res"); 1710f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1711f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1712f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1713f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* round to nearest (toward zero) */ 1714f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res"); 1715f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1716f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1717f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1718f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1719f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1720f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1721f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Return ceiling of float (vector), returning int (vector). 
1722f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: iceil( 1.1) = 2 1723f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: iceil(-1.1) = -1 1724f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1725f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1726f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_iceil(struct lp_build_context *bld, 1727f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1728f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1729f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1730f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1731f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef int_vec_type = bld->int_vec_type; 1732f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1733f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1734f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1735f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1736f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1737f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (sse41_rounding_available(type)) { 1738f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 1739f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1740f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1741f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = bld->vec_type; 1742f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned mantissa = lp_mantissa(type); 1743f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef offset; 1744f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1745f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
/* offset = 0.99999(9)f */ 1746f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org offset = lp_build_const_vec(bld->gallivm, type, 1747f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); 1748f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1749f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.sign) { 1750f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1751f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (unsigned long long)1 << (type.width - 1)); 1752f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sign; 1753f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1754f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* sign = a < 0 ? 0 : ~0 */ 1755f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1756f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sign = LLVMBuildAnd(builder, sign, mask, ""); 1757f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sign = LLVMBuildAShr(builder, sign, 1758f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_const_int_vec(bld->gallivm, type, 1759f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org type.width - 1), 1760f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org "iceil.sign"); 1761f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org sign = LLVMBuildNot(builder, sign, "iceil.not"); 1762f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1763f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* offset = a < 0 ? 
0.0 : offset */ 1764f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org offset = LLVMConstBitCast(offset, int_vec_type); 1765f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org offset = LLVMBuildAnd(builder, offset, sign, ""); 1766f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset"); 1767f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1768f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1769f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFAdd(builder, a, offset, "iceil.res"); 1770f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1771f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1772f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* round to nearest (toward zero) */ 1773f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res"); 1774f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1775f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1776f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1777f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1778f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1779f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1780f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Combined ifloor() & fract(). 1781f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 1782f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Preferred to calling the functions separately, as it will ensure that the 1783f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * strategy (floor() vs ifloor()) that results in less redundant work is used. 
1784f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1785f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid 1786f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_ifloor_fract(struct lp_build_context *bld, 1787f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 1788f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef *out_ipart, 1789f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef *out_fpart) 1790f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1791f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1792f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1793f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef ipart; 1794f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1795f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1796f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1797f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1798f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (sse41_rounding_available(type)) { 1799f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 1800f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * floor() is easier. 
1801f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1802f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1803f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ipart = lp_build_floor(bld, a); 1804f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 1805f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart"); 1806f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1807f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1808f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 1809f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * ifloor() is easier. 1810f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1811f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1812f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *out_ipart = lp_build_ifloor(bld, a); 1813f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart"); 1814f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 1815f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1816f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1817f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1818f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1819f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1820f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Same as lp_build_ifloor_fract, but guarantees that the fractional part is 1821f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * always smaller than one. 
1822f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1823f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid 1824f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_ifloor_fract_safe(struct lp_build_context *bld, 1825f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 1826f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef *out_ipart, 1827f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef *out_fpart) 1828f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1829f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_ifloor_fract(bld, a, out_ipart, out_fpart); 1830f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *out_fpart = clamp_fract(bld, *out_fpart); 1831f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1832f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1833f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1834f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1835f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_sqrt(struct lp_build_context *bld, 1836f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1837f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1838f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1839f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1840f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1841f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org char intrinsic[32]; 1842f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1843f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1844f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1845f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* TODO: optimize the constant 
case */ 1846f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1847f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1848f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.length == 1) { 1849f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width); 1850f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1851f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1852f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width); 1853f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1854f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1855f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 1856f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1857f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1858f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1859f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1860f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Do one Newton-Raphson step to improve reciprocate precision: 1861f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 1862f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x_{i+1} = x_i * (2 - a * x_i) 1863f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 1864f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or 1865f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * +/-Inf, giving NaN instead. Certain applications rely on this behavior, 1866f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's 1867f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * halo. 
It would be necessary to clamp the argument to prevent this. 1868f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 1869f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * See also: 1870f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division 1871f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - http://softwarecommunity.intel.com/articles/eng/1818.htm 1872f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1873f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic INLINE LLVMValueRef 1874f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_rcp_refine(struct lp_build_context *bld, 1875f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 1876f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef rcp_a) 1877f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1878f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1879f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0); 1880f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1881f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1882f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFMul(builder, a, rcp_a, ""); 1883f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFSub(builder, two, res, ""); 1884f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFMul(builder, rcp_a, res, ""); 1885f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1886f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1887f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1888f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1889f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
1890f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 1891f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_rcp(struct lp_build_context *bld, 1892f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1893f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1894f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1895f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1896f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1897f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1898f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1899f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->zero) 1900f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 1901f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->one) 1902f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->one; 1903f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(a == bld->undef) 1904f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 1905f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1906f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1907f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1908f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(LLVMIsConstant(a)) 1909f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMConstFDiv(bld->one, a); 1910f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1911f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 1912f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * We don't use RCPPS because: 1913f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - it only has 10bits of precision 1914f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - it doesn't even 
get the reciprocate of 1.0 exactly 1915f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf 1916f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - for recent processors the benefit over DIVPS is marginal, a case 1917f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * dependent 1918f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 1919f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * We could still use it on certain processors if benchmarks show that the 1920f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for 1921f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * particular uses that require less workarounds. 1922f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1923f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1924f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 1925f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){ 1926f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const unsigned num_iterations = 0; 1927f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1928f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned i; 1929f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const char *intrinsic = NULL; 1930f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1931f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.length == 4) { 1932f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse.rcp.ps"; 1933f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1934f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 1935f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = 
"llvm.x86.avx.rcp.ps.256"; 1936f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1937f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1938f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 1939f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1940f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org for (i = 0; i < num_iterations; ++i) { 1941f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_rcp_refine(bld, a, res); 1942f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1943f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1944f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1945f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 1946f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1947f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildFDiv(builder, bld->one, a, ""); 1948f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1949f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1950f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1951f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1952f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Do one Newton-Raphson step to improve rsqrt precision: 1953f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 1954f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i) 1955f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 1956f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * See also: 1957f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * - http://softwarecommunity.intel.com/articles/eng/1818.htm 1958f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1959f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic INLINE LLVMValueRef 
1960f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_rsqrt_refine(struct lp_build_context *bld, 1961f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a, 1962f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef rsqrt_a) 1963f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1964f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1965f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5); 1966f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0); 1967f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1968f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1969f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, ""); 1970f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFMul(builder, a, res, ""); 1971f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFSub(builder, three, res, ""); 1972f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFMul(builder, rsqrt_a, res, ""); 1973f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFMul(builder, half, res, ""); 1974f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1975f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 1976f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 1977f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1978f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1979f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 1980f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate 1/sqrt(a) 1981f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 1982f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 
1983f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_rsqrt(struct lp_build_context *bld, 1984f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 1985f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 1986f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 1987f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 1988f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1989f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, a)); 1990f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1991f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 1992f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1993f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 1994f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { 1995f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const unsigned num_iterations = 1; 1996f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 1997f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned i; 1998f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const char *intrinsic = NULL; 1999f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2000f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.length == 4) { 2001f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.sse.rsqrt.ps"; 2002f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2003f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else { 2004f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org intrinsic = "llvm.x86.avx.rsqrt.ps.256"; 2005f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 
2006f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2007f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2008f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2009f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2010f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org for (i = 0; i < num_iterations; ++i) { 2011f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_rsqrt_refine(bld, a, res); 2012f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2013f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2014f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 2015f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2016f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2017f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 2018f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2019f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2020f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2021f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2022f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate sin(a) using SSE2 2023f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2024f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2025f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_sin(struct lp_build_context *bld, 2026f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 2027f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2028f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct gallivm_state *gallivm = bld->gallivm; 2029f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = gallivm->builder; 2030f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct lp_type int_type = lp_int_type(bld->type); 
2031f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef b = builder; 2032f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2033f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2034f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * take the absolute value, 2035f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 2036f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2037f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2038f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000); 2039f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si"); 2040f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2041f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 2042f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs"); 2043f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2044f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2045f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * extract the sign bit (upper one) 2046f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask); 2047f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2048f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000); 2049f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i"); 2050f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2051f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 
2052f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * scale by 4/Pi 2053f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 2054f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2055f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2056f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516); 2057f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 2058f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2059f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2060f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * store the integer part of y in mm0 2061f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_cvttps_epi32(y); 2062f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2063f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2064f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i"); 2065f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2066f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2067f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * j=(j+1) & (~1) (see the cephes sources) 2068f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 2069f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2070f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2071f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1); 2072f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 2073f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 
2074f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 2075f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2076f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1); 2077f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 2078f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2079f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2080f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_cvtepi32_ps(emm2); 2081f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2082f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2"); 2083f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2084f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* get the swap sign flag 2085f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4); 2086f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2087f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4); 2088f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and"); 2089f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2090f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2091f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_slli_epi32(emm0, 29); 2092f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2093f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29); 2094f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit"); 
2095f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2096f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2097f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * get the polynom selection mask 2098f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * there is one polynom for 0 <= x <= Pi/4 2099f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * and another one for Pi/4<x<=Pi/2 2100f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Both branches will be computed. 2101f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2102f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 2103f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 2104f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2105f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2106f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 2107f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3"); 2108f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef poly_mask = lp_build_compare(gallivm, 2109f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int_type, PIPE_FUNC_EQUAL, 2110f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0)); 2111f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2112f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit); 2113f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2114f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit"); 2115f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
2116f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2117f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(minus_cephes_DP1, -0.78515625); 2118f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 2119f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 2120f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2121f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625); 2122f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4); 2123f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8); 2124f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2125f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2126f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * The magic pass: "Extended precision modular arithmetic" 2127f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x = ((x - y * DP1) - y * DP2) - y * DP3; 2128f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * xmm1 = _mm_mul_ps(y, xmm1); 2129f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * xmm2 = _mm_mul_ps(y, xmm2); 2130f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * xmm3 = _mm_mul_ps(y, xmm3); 2131f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2132f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); 2133f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); 2134f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); 2135f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
2136f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2137f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x = _mm_add_ps(x, xmm1); 2138f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x = _mm_add_ps(x, xmm2); 2139f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x = _mm_add_ps(x, xmm3); 2140f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2141f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2142f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); 2143f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2"); 2144f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); 2145f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2146f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2147f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Evaluate the first polynom (0 <= x <= Pi/4) 2148f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2149f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * z = _mm_mul_ps(x,x); 2150f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2151f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 2152f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2153f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2154f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(coscof_p0, 2.443315711809948E-005); 2155f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(coscof_p1, -1.388731625493765E-003); 2156f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(coscof_p2, 4.166664568298827E-002); 2157f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2158f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 
2.443315711809948E-005); 2159f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003); 2160f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002); 2161f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2162f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2163f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = *(v4sf*)_ps_coscof_p0; 2164f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_mul_ps(y, z); 2165f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2166f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); 2167f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4"); 2168f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); 2169f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); 2170f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 2171f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 2172f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2173f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2174f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2175f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 2176f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_sub_ps(y, tmp); 2177f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_add_ps(y, *(v4sf*)_ps_1); 2178f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2179f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef 
half = lp_build_const_vec(gallivm, bld->type, 0.5); 2180f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); 2181f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 2182f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0); 2183f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 2184f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2185f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2186f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(sincof_p0, -1.9515295891E-4); 2187f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(sincof_p1, 8.3321608736E-3); 2188f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(sincof_p2, -1.6666654611E-1); 2189f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2190f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4); 2191f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3); 2192f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1); 2193f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2194f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2195f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Evaluate the second polynom (Pi/4 <= x <= 0) 2196f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2197f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = *(v4sf*)_ps_sincof_p0; 2198f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_mul_ps(y2, z); 2199f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = 
_mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 2200f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_mul_ps(y2, z); 2201f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 2202f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_mul_ps(y2, z); 2203f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_mul_ps(y2, x); 2204f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_add_ps(y2, x); 2205f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2206f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2207f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); 2208f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); 2209f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); 2210f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); 2211f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 2212f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); 2213f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); 2214f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2215f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2216f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * select the correct result from the two polynoms 2217f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * xmm3 = poly_mask; 2218f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 2219f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_andnot_ps(xmm3, y); 
2220f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_add_ps(y,y2); 2221f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2222f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i"); 2223f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i"); 2224f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 2225f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0); 2226f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); 2227f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 2228f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); 2229f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2230f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2231f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * update the sign 2232f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_xor_ps(y, sign_bit); 2233f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2234f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin"); 2235f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result"); 2236f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return y_result; 2237f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2238f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2239f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
2240f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2241f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate cos(a) using SSE2 2242f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2243f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2244f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_cos(struct lp_build_context *bld, 2245f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a) 2246f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2247f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct gallivm_state *gallivm = bld->gallivm; 2248f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = gallivm->builder; 2249f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct lp_type int_type = lp_int_type(bld->type); 2250f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef b = builder; 2251f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2252f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2253f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * take the absolute value, 2254f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 2255f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2256f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2257f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000); 2258f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si"); 2259f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2260f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 2261f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs"); 
2262f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2263f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2264f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * scale by 4/Pi 2265f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 2266f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2267f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2268f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516); 2269f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 2270f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2271f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2272f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * store the integer part of y in mm0 2273f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_cvttps_epi32(y); 2274f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2275f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2276f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i"); 2277f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2278f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2279f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * j=(j+1) & (~1) (see the cephes sources) 2280f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 2281f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2282f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2283f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1); 2284f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, 
"emm2_add"); 2285f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2286f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 2287f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2288f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1); 2289f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 2290f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2291f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2292f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_cvtepi32_ps(emm2); 2293f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2294f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2"); 2295f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2296f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2297f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2298f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2); 2299f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2300f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 2301f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2"); 2302f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2303f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2304f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* get the swap sign flag 2305f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4); 2306f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2307f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0); 2308f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not"); 2309f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4); 2310f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and"); 2311f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2312f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2313f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_slli_epi32(emm0, 29); 2314f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2315f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29); 2316f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit"); 2317f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2318f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2319f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * get the polynom selection mask 2320f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * there is one polynom for 0 <= x <= Pi/4 2321f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * and another one for Pi/4<x<=Pi/2 2322f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Both branches will be computed. 
2323f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2324f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 2325f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 2326f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2327f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2328f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 2329f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3"); 2330f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef poly_mask = lp_build_compare(gallivm, 2331f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int_type, PIPE_FUNC_EQUAL, 2332f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0)); 2333f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2334f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2335f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(minus_cephes_DP1, -0.78515625); 2336f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 2337f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 2338f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2339f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625); 2340f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4); 2341f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8); 2342f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
2343f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2344f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * The magic pass: "Extended precision modular arithmetic" 2345f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x = ((x - y * DP1) - y * DP2) - y * DP3; 2346f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * xmm1 = _mm_mul_ps(y, xmm1); 2347f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * xmm2 = _mm_mul_ps(y, xmm2); 2348f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * xmm3 = _mm_mul_ps(y, xmm3); 2349f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2350f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); 2351f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); 2352f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); 2353f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2354f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2355f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x = _mm_add_ps(x, xmm1); 2356f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x = _mm_add_ps(x, xmm2); 2357f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x = _mm_add_ps(x, xmm3); 2358f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2359f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2360f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); 2361f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2"); 2362f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); 2363f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2364f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 
2365f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Evaluate the first polynom (0 <= x <= Pi/4) 2366f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2367f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * z = _mm_mul_ps(x,x); 2368f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2369f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 2370f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2371f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2372f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(coscof_p0, 2.443315711809948E-005); 2373f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(coscof_p1, -1.388731625493765E-003); 2374f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(coscof_p2, 4.166664568298827E-002); 2375f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2376f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005); 2377f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003); 2378f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002); 2379f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2380f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2381f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = *(v4sf*)_ps_coscof_p0; 2382f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_mul_ps(y, z); 2383f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2384f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); 2385f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, 
"y_4"); 2386f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); 2387f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); 2388f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 2389f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 2390f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2391f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2392f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2393f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 2394f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_sub_ps(y, tmp); 2395f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_add_ps(y, *(v4sf*)_ps_1); 2396f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2397f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5); 2398f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); 2399f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 2400f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0); 2401f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 2402f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2403f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2404f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(sincof_p0, -1.9515295891E-4); 2405f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * _PS_CONST(sincof_p1, 8.3321608736E-3); 2406f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 
_PS_CONST(sincof_p2, -1.6666654611E-1); 2407f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2408f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4); 2409f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3); 2410f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1); 2411f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2412f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2413f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Evaluate the second polynom (Pi/4 <= x <= 0) 2414f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2415f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = *(v4sf*)_ps_sincof_p0; 2416f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_mul_ps(y2, z); 2417f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 2418f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_mul_ps(y2, z); 2419f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 2420f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_mul_ps(y2, z); 2421f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_mul_ps(y2, x); 2422f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_add_ps(y2, x); 2423f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2424f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2425f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); 2426f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); 
2427f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); 2428f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); 2429f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 2430f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); 2431f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); 2432f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2433f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2434f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * select the correct result from the two polynoms 2435f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * xmm3 = poly_mask; 2436f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 2437f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_andnot_ps(xmm3, y); 2438f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_add_ps(y,y2); 2439f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2440f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i"); 2441f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i"); 2442f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 2443f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); 2444f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 2445f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_combine = 
LLVMBuildAdd(b, y_and, y2_and, "y_combine"); 2446f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2447f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2448f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * update the sign 2449f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * y = _mm_xor_ps(y, sign_bit); 2450f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2451f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin"); 2452f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result"); 2453f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return y_result; 2454f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2455f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2456f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2457f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2458f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate pow(x, y) 2459f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2460f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2461f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_pow(struct lp_build_context *bld, 2462f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x, 2463f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y) 2464f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2465f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* TODO: optimize the constant case */ 2466f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (gallivm_debug & GALLIVM_DEBUG_PERF && 2467f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMIsConstant(x) && LLVMIsConstant(y)) { 2468f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org debug_printf("%s: inefficient/imprecise constant arithmetic\n", 
2469f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __FUNCTION__); 2470f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2471f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2472f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y)); 2473f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2474f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2475f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2476f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2477f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate exp(x) 2478f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2479f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2480f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_exp(struct lp_build_context *bld, 2481f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x) 2482f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2483f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* log2(e) = 1/log(2) */ 2484f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type, 2485f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1.4426950408889634); 2486f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2487f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, x)); 2488f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2489f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_exp2(bld, lp_build_mul(bld, log2e, x)); 2490f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2491f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2492f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2493f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2494f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate log(x) 
2495f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2496f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2497f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_log(struct lp_build_context *bld, 2498f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x) 2499f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2500f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* log(2) */ 2501f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 2502f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.69314718055994529); 2503f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2504f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, x)); 2505f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2506f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_mul(bld, log2, lp_build_log2(bld, x)); 2507f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2508f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2509f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2510f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2511f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Generate polynomial. 2512f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 
2513f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2514f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic LLVMValueRef 2515f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_polynomial(struct lp_build_context *bld, 2516f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x, 2517f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const double *coeffs, 2518f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned num_coeffs) 2519f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2520f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 2521f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef even = NULL, odd = NULL; 2522f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x2; 2523f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned i; 2524f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2525f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, x)); 2526f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2527f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* TODO: optimize the constant case */ 2528f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (gallivm_debug & GALLIVM_DEBUG_PERF && 2529f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMIsConstant(x)) { 2530f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2531f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __FUNCTION__); 2532f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2533f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2534f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2535f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Calculate odd and even terms seperately to decrease data dependency 2536f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 
Ex: 2537f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * c[0] + x^2 * c[2] + x^4 * c[4] ... 2538f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ... 2539f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2540f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org x2 = lp_build_mul(bld, x, x); 2541f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2542f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org for (i = num_coeffs; i--; ) { 2543f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef coeff; 2544f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2545f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]); 2546f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2547f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (i % 2 == 0) { 2548f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (even) 2549f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even)); 2550f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 2551f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org even = coeff; 2552f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } else { 2553f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (odd) 2554f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd)); 2555f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 2556f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org odd = coeff; 2557f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2558f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2559f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2560f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (odd) 2561f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return lp_build_add(bld, 
lp_build_mul(bld, odd, x), even); 2562f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (even) 2563f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return even; 2564f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 2565f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return bld->undef; 2566f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2567f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2568f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2569f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2570f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Minimax polynomial fit of 2**x, in range [0, 1[ 2571f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2572f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgconst double lp_build_exp2_polynomial[] = { 2573f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#if EXP_POLY_DEGREE == 5 2574f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.999999925063526176901, 2575f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.693153073200168932794, 2576f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.240153617044375388211, 2577f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.0558263180532956664775, 2578f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.00898934009049466391101, 2579f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.00187757667519147912699 2580f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#elif EXP_POLY_DEGREE == 4 2581f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1.00000259337069434683, 2582f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.693003834469974940458, 2583f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.24144275689150793076, 2584f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.0520114606103070150235, 2585f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.0135341679161270268764 
2586f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#elif EXP_POLY_DEGREE == 3 2587f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.999925218562710312959, 2588f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.695833540494823811697, 2589f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.226067155427249155588, 2590f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.0780245226406372992967 2591f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#elif EXP_POLY_DEGREE == 2 2592f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 1.00172476321474503578, 2593f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.657636275736077639316, 2594f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.33718943461968720704 2595f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#else 2596f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#error 2597f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#endif 2598f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org}; 2599f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2600f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2601f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid 2602f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_exp2_approx(struct lp_build_context *bld, 2603f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x, 2604f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef *p_exp2_int_part, 2605f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef *p_frac_part, 2606f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef *p_exp2) 2607f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2608f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 2609f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 
2610f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 2611f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef ipart = NULL; 2612f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef fpart = NULL; 2613f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef expipart = NULL; 2614f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef expfpart = NULL; 2615f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res = NULL; 2616f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2617f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, x)); 2618f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2619f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(p_exp2_int_part || p_frac_part || p_exp2) { 2620f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* TODO: optimize the constant case */ 2621f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (gallivm_debug & GALLIVM_DEBUG_PERF && 2622f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMIsConstant(x)) { 2623f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2624f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __FUNCTION__); 2625f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2626f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2627f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating && type.width == 32); 2628f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2629f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0)); 2630f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999)); 
2631f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2632f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* ipart = floor(x) */ 2633f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* fpart = x - ipart */ 2634f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_ifloor_fract(bld, x, &ipart, &fpart); 2635f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2636f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2637f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(p_exp2_int_part || p_exp2) { 2638f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* expipart = (float) (1 << ipart) */ 2639f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org expipart = LLVMBuildAdd(builder, ipart, 2640f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_const_int_vec(bld->gallivm, type, 127), ""); 2641f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org expipart = LLVMBuildShl(builder, expipart, 2642f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_const_int_vec(bld->gallivm, type, 23), ""); 2643f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org expipart = LLVMBuildBitCast(builder, expipart, vec_type, ""); 2644f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2645f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2646f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(p_exp2) { 2647f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, 2648f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org Elements(lp_build_exp2_polynomial)); 2649f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2650f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFMul(builder, expipart, expfpart, ""); 2651f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2652f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2653f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
if(p_exp2_int_part) 2654f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *p_exp2_int_part = expipart; 2655f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2656f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(p_frac_part) 2657f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *p_frac_part = fpart; 2658f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2659f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(p_exp2) 2660f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *p_exp2 = res; 2661f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2662f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2663f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2664f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2665f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_exp2(struct lp_build_context *bld, 2666f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x) 2667f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2668f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 2669f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_exp2_approx(bld, x, NULL, NULL, &res); 2670f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 2671f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2672f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2673f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2674f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2675f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Extract the exponent of a IEEE-754 floating point value. 2676f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2677f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Optionally apply an integer bias. 
2678f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2679f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Result is an integer value with 2680f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2681f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * ifloor(log2(x)) + bias 2682f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2683f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2684f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_extract_exponent(struct lp_build_context *bld, 2685f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x, 2686f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int bias) 2687f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2688f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 2689f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 2690f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned mantissa = lp_mantissa(type); 2691f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 2692f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2693f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 2694f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2695f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, x)); 2696f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2697f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 2698f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2699f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildLShr(builder, x, 2700f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_const_int_vec(bld->gallivm, type, mantissa), ""); 2701f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = 
LLVMBuildAnd(builder, res, 2702f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_const_int_vec(bld->gallivm, type, 255), ""); 2703f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildSub(builder, res, 2704f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_const_int_vec(bld->gallivm, type, 127 - bias), ""); 2705f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2706f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 2707f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2708f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2709f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2710f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2711f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Extract the mantissa of the a floating. 2712f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2713f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Result is a floating point value with 2714f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2715f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * x / floor(log2(x)) 2716f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2717f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2718f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_extract_mantissa(struct lp_build_context *bld, 2719f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x) 2720f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2721f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 2722f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 2723f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned mantissa = lp_mantissa(type); 2724f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 
2725f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org (1ULL << mantissa) - 1); 2726f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); 2727f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 2728f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2729f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, x)); 2730f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2731f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating); 2732f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2733f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 2734f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2735f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* res = x / 2**ipart */ 2736f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildAnd(builder, x, mantmask, ""); 2737f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildOr(builder, res, one, ""); 2738f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 2739f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2740f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 2741f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2742f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2743f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2744f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2745f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2746f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[ 2747f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * These coefficients can be generate with 
2748f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html 2749f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2750f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgconst double lp_build_log2_polynomial[] = { 2751f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#if LOG_POLY_DEGREE == 5 2752f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2.88539008148777786488L, 2753f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.961796878841293367824L, 2754f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.577058946784739859012L, 2755f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.412914355135828735411L, 2756f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.308591899232910175289L, 2757f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.352376952300281371868L, 2758f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#elif LOG_POLY_DEGREE == 4 2759f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2.88539009343309178325L, 2760f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.961791550404184197881L, 2761f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.577440339438736392009L, 2762f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.403343858251329912514L, 2763f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.406718052498846252698L, 2764f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#elif LOG_POLY_DEGREE == 3 2765f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2.88538959748872753838L, 2766f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.961932915889597772928L, 2767f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.571118517972136195241L, 2768f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0.493997535084709500285L, 2769f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#else 
2770f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#error 2771f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#endif 2772f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org}; 2773f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2774f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2775f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * See http://www.devmaster.net/forums/showthread.php?p=43580 2776f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * http://en.wikipedia.org/wiki/Logarithm#Calculation 2777f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * http://www.nezumi.demon.co.uk/consult/logx.htm 2778f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2779f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid 2780f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_log2_approx(struct lp_build_context *bld, 2781f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x, 2782f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef *p_exp, 2783f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef *p_floor_log2, 2784f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef *p_log2) 2785f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2786f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 2787f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 2788f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 2789f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 2790f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2791f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000); 
2792f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff); 2793f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); 2794f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2795f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef i = NULL; 2796f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y = NULL; 2797f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef z = NULL; 2798f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef exp = NULL; 2799f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef mant = NULL; 2800f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef logexp = NULL; 2801f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef logmant = NULL; 2802f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res = NULL; 2803f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2804f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, x)); 2805f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2806f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(p_exp || p_floor_log2 || p_log2) { 2807f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* TODO: optimize the constant case */ 2808f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (gallivm_debug & GALLIVM_DEBUG_PERF && 2809f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMIsConstant(x)) { 2810f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2811f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __FUNCTION__); 2812f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2813f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
2814f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(type.floating && type.width == 32); 2815f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2816f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* 2817f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * We don't explicitly handle denormalized numbers. They will yield a 2818f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * result in the neighbourhood of -127, which appears to be adequate 2819f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * enough. 2820f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2821f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2822f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org i = LLVMBuildBitCast(builder, x, int_vec_type, ""); 2823f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2824f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* exp = (float) exponent(x) */ 2825f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org exp = LLVMBuildAnd(builder, i, expmask, ""); 2826f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2827f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2828f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(p_floor_log2 || p_log2) { 2829f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), ""); 2830f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), ""); 2831f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org logexp = LLVMBuildSIToFP(builder, logexp, vec_type, ""); 2832f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2833f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2834f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(p_log2) { 2835f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* mant = 1 + (float) mantissa(x) */ 
2836f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mant = LLVMBuildAnd(builder, i, mantmask, ""); 2837f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mant = LLVMBuildOr(builder, mant, one, ""); 2838f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mant = LLVMBuildBitCast(builder, mant, vec_type, ""); 2839f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2840f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* y = (mant - 1) / (mant + 1) */ 2841f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org y = lp_build_div(bld, 2842f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_sub(bld, mant, bld->one), 2843f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_add(bld, mant, bld->one) 2844f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ); 2845f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2846f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* z = y^2 */ 2847f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org z = lp_build_mul(bld, y, y); 2848f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2849f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* compute P(z) */ 2850f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial, 2851f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org Elements(lp_build_log2_polynomial)); 2852f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2853f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* logmant = y * P(z) */ 2854f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org logmant = lp_build_mul(bld, y, logmant); 2855f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2856f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = lp_build_add(bld, logmant, logexp); 2857f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2858f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
2859f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(p_exp) { 2860f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org exp = LLVMBuildBitCast(builder, exp, vec_type, ""); 2861f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *p_exp = exp; 2862f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 2863f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2864f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(p_floor_log2) 2865f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *p_floor_log2 = logexp; 2866f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2867f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if(p_log2) 2868f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *p_log2 = res; 2869f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2870f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2871f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2872f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2873f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_log2(struct lp_build_context *bld, 2874f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x) 2875f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2876f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 2877f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_build_log2_approx(bld, x, NULL, NULL, &res); 2878f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 2879f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2880f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2881f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2882f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2883f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Faster (and less accurate) log2. 
2884f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2885f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x)) 2886f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2887f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Piece-wise linear approximation, with exact results when x is a 2888f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * power of two. 2889f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2890f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * See http://www.flipcode.com/archives/Fast_log_Function.shtml 2891f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2892f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2893f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_fast_log2(struct lp_build_context *bld, 2894f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x) 2895f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2896f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 2897f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef ipart; 2898f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef fpart; 2899f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2900f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, x)); 2901f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2902f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(bld->type.floating); 2903f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2904f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* ipart = floor(log2(x)) - 1 */ 2905f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ipart = lp_build_extract_exponent(bld, x, -1); 2906f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, ""); 
2907f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2908f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* fpart = x / 2**ipart */ 2909f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org fpart = lp_build_extract_mantissa(bld, x); 2910f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2911f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* ipart + fpart */ 2912f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return LLVMBuildFAdd(builder, ipart, fpart, ""); 2913f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2914f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2915f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2916f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 2917f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Fast implementation of iround(log2(x)). 2918f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * 2919f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Not an approximation -- it should give accurate results all the time. 
2920f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 2921f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2922f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_ilog2(struct lp_build_context *bld, 2923f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x) 2924f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2925f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 2926f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2); 2927f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef ipart; 2928f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2929f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(bld->type.floating); 2930f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2931f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(bld->type, x)); 2932f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2933f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ 2934f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org x = LLVMBuildFMul(builder, x, sqrt2, ""); 2935f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2936f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* ipart = floor(log2(x) + 0.5) */ 2937f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org ipart = lp_build_extract_exponent(bld, x, 0); 2938f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2939f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return ipart; 2940f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2941f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2942f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgLLVMValueRef 2943f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_build_mod(struct lp_build_context *bld, 
2944f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef x, 2945f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef y) 2946f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 2947f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMBuilderRef builder = bld->gallivm->builder; 2948f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org LLVMValueRef res; 2949f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_type type = bld->type; 2950f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2951f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, x)); 2952f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(lp_check_value(type, y)); 2953f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 2954f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (type.floating) 2955f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildFRem(builder, x, y, ""); 2956f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else if (type.sign) 2957f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildSRem(builder, x, y, ""); 2958f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org else 2959f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org res = LLVMBuildURem(builder, x, y, ""); 2960f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return res; 2961f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 2962