1bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca/************************************************************************** 2bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * 3bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Copyright 2010 VMware, Inc. 4bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * All Rights Reserved. 5bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * 6bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Permission is hereby granted, free of charge, to any person obtaining a 7bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * copy of this software and associated documentation files (the 8bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * "Software"), to deal in the Software without restriction, including 9bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * without limitation the rights to use, copy, modify, merge, publish, 10bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * distribute, sub license, and/or sell copies of the Software, and to 11bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * permit persons to whom the Software is furnished to do so, subject to 12bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * the following conditions: 13bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * 14bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 17bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, 18bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 20bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * USE OR OTHER DEALINGS IN THE SOFTWARE. 21bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * 22bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * The above copyright notice and this permission notice (including the 23bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * next paragraph) shall be included in all copies or substantial portions 24bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * of the Software. 25bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * 26bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca **************************************************************************/ 27bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 28bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 29bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca#include "util/u_debug.h" 30437d7e1bafedeea5c69e495b57c215977c727617José Fonseca#include "util/u_cpu_detect.h" 318bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger#include "util/u_math.h" 32bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca#include "lp_bld_debug.h" 33bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca#include "lp_bld_const.h" 34bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca#include "lp_bld_format.h" 35bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca#include "lp_bld_gather.h" 36437d7e1bafedeea5c69e495b57c215977c727617José Fonseca#include "lp_bld_swizzle.h" 378ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger#include "lp_bld_type.h" 38efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul#include "lp_bld_init.h" 39e25abacc1883f1b2e09c32230e35ffae7df5e61bAdhemerval Zanella#include "lp_bld_intr.h" 408bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger#include "lp_bld_pack.h" 41bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 42bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 43bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca/** 44bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Get the pointer to one element from scatter positions in memory. 45bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * 46bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * @sa lp_build_gather() 47bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca */ 48bb1546f55be3b243b71d39e5fb7457c5b21e32c9José FonsecaLLVMValueRef 49efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paullp_build_gather_elem_ptr(struct gallivm_state *gallivm, 50bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca unsigned length, 51bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca LLVMValueRef base_ptr, 52bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca LLVMValueRef offsets, 53bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca unsigned i) 54bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca{ 55bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca LLVMValueRef offset; 56bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca LLVMValueRef ptr; 57bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 58efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); 59bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 60bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca if (length == 1) { 61bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca assert(i == 0); 62bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca offset = offsets; 63bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca } else { 64efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul LLVMValueRef index = lp_build_const_int32(gallivm, i); 65efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, ""); 66bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca } 67bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 68efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul ptr = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, ""); 69bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 70bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca return ptr; 71bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca} 72bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 73bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 74bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca/** 75bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Gather one element from scatter positions in memory. 76bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * 77bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * @sa lp_build_gather() 78bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca */ 79bb1546f55be3b243b71d39e5fb7457c5b21e32c9José FonsecaLLVMValueRef 80efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paullp_build_gather_elem(struct gallivm_state *gallivm, 81bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca unsigned length, 82bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca unsigned src_width, 83bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca unsigned dst_width, 8474f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger boolean aligned, 85bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca LLVMValueRef base_ptr, 86bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca LLVMValueRef offsets, 872151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson unsigned i, 882151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson boolean vector_justify) 89bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca{ 90efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width); 91bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); 92efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width); 93bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca LLVMValueRef ptr; 94bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca LLVMValueRef res; 95bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 96efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); 97bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 98efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); 99efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); 100efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul res = LLVMBuildLoad(gallivm->builder, ptr, ""); 101bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 10274f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger /* XXX 10374f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * On some archs we probably really want to avoid having to deal 10474f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * with alignments lower than 4 bytes (if fetch size is a power of 10574f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * two >= 32). On x86 it doesn't matter, however. 10674f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * We should be able to guarantee full alignment for any kind of texture 10774f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch 10874f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends 10974f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * but I don't think that's quite what we wanted). 11074f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT 11174f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * looks like a good fit, but it seems this cap bit (and OpenGL) aren't 11274f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * enforcing what we want (which is what d3d10 does, the offset needs to 11374f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * be aligned to element size, but GL has bytes regardless of element 11474f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * size which would only leave us with minimum alignment restriction of 16 11574f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * which doesn't make much sense if the type isn't 4x32bit). Due to 11674f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * translation of offsets to first_elem in sampler_views it actually seems 11774f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * gallium could not do anything else except 16 no matter what... 11874f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger */ 1198bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (!aligned) { 120bcfb86b09de3bfc9c7cdf6925658b5e529a8fc62José Fonseca LLVMSetAlignment(res, 1); 1218bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else if (!util_is_power_of_two(src_width)) { 1228bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* 1238bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Full alignment is impossible, assume the caller really meant 1248bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * the individual elements were aligned (e.g. 3x32bit format). 1258bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * And yes the generated code may otherwise crash, llvm will 1268bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * really assume 128bit alignment with a 96bit fetch (I suppose 1278bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * that makes sense as it can just assume the upper 32bit to be 1288bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * whatever). 1298bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Maybe the caller should be able to explicitly set this, but 1308bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * this should cover all the 3-channel formats. 1318bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger */ 1328bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (((src_width / 24) * 24 == src_width) && 1338bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger util_is_power_of_two(src_width / 24)) { 1348bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMSetAlignment(res, src_width / 24); 1358bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else { 1368bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMSetAlignment(res, 1); 1378bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 13874f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger } 13974f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger 140bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca assert(src_width <= dst_width); 1418bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (src_width < dst_width) { 142efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); 1432151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson if (vector_justify) { 144e25abacc1883f1b2e09c32230e35ffae7df5e61bAdhemerval Zanella#ifdef PIPE_ARCH_BIG_ENDIAN 1452151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson res = LLVMBuildShl(gallivm->builder, res, 1462151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson LLVMConstInt(dst_elem_type, dst_width - src_width, 0), ""); 147e25abacc1883f1b2e09c32230e35ffae7df5e61bAdhemerval Zanella#endif 1482151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson } 149e25abacc1883f1b2e09c32230e35ffae7df5e61bAdhemerval Zanella } 150bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 151bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca return res; 152bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca} 153bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 154bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 1558bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger/** 1568bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Gather one element from scatter positions in memory. 1578bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Nearly the same as above, however the individual elements 1588bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * may be vectors themselves, and fetches may be float type. 1598bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Can also do pad vector instead of ZExt. 1608bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * 1618bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * @sa lp_build_gather() 1628bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger */ 1638bd67a35c50e68c21aed043de11e095c284d151aRoland Scheideggerstatic LLVMValueRef 1648bd67a35c50e68c21aed043de11e095c284d151aRoland Scheideggerlp_build_gather_elem_vec(struct gallivm_state *gallivm, 1658bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger unsigned length, 1668bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger unsigned src_width, 1678bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMTypeRef src_type, 1688bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger struct lp_type dst_type, 1698bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger boolean aligned, 1708bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMValueRef base_ptr, 1718bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMValueRef offsets, 1728bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger unsigned i, 1738bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger boolean vector_justify) 1748bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger{ 1758bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMValueRef ptr, res; 1768bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); 1778bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); 1788bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 1798bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); 1808bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); 1818bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res = LLVMBuildLoad(gallivm->builder, ptr, ""); 1828bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 1838bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* XXX 1848bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * On some archs we probably really want to avoid having to deal 1858bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * with alignments lower than 4 bytes (if fetch size is a power of 1868bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * two >= 32). On x86 it doesn't matter, however. 1878bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * We should be able to guarantee full alignment for any kind of texture 1888bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch 1898bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends 1908bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * but I don't think that's quite what we wanted). 1918bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT 1928bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * looks like a good fit, but it seems this cap bit (and OpenGL) aren't 1938bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * enforcing what we want (which is what d3d10 does, the offset needs to 1948bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * be aligned to element size, but GL has bytes regardless of element 1958bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * size which would only leave us with minimum alignment restriction of 16 1968bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * which doesn't make much sense if the type isn't 4x32bit). Due to 1978bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * translation of offsets to first_elem in sampler_views it actually seems 1988bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * gallium could not do anything else except 16 no matter what... 1998bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger */ 2008bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (!aligned) { 2018bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMSetAlignment(res, 1); 2028bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else if (!util_is_power_of_two(src_width)) { 2038bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* 2048bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Full alignment is impossible, assume the caller really meant 2058bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * the individual elements were aligned (e.g. 3x32bit format). 2068bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * And yes the generated code may otherwise crash, llvm will 2078bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * really assume 128bit alignment with a 96bit fetch (I suppose 2088bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * that makes sense as it can just assume the upper 32bit to be 2098bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * whatever). 2108bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Maybe the caller should be able to explicitly set this, but 2118bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * this should cover all the 3-channel formats. 2128bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger */ 2138bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (((src_width / 24) * 24 == src_width) && 2148bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger util_is_power_of_two(src_width / 24)) { 2158bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMSetAlignment(res, src_width / 24); 2168bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else { 2178bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMSetAlignment(res, 1); 2188bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 2198bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 2208bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 2218bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger assert(src_width <= dst_type.width * dst_type.length); 2228bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (src_width < dst_type.width * dst_type.length) { 2238bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (dst_type.length > 1) { 2248bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res = lp_build_pad_vector(gallivm, res, dst_type.length); 2258bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* 2268bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * vector_justify hopefully a non-issue since we only deal 2278bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * with src_width >= 32 here? 2288bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger */ 2298bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else { 23056441708cf0188d9d8c852a6353fc45c03e6a22dRoland Scheidegger LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type); 23156441708cf0188d9d8c852a6353fc45c03e6a22dRoland Scheidegger 2328bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* 2338bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Only valid if src_ptr_type is int type... 2348bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger */ 23556441708cf0188d9d8c852a6353fc45c03e6a22dRoland Scheidegger res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); 23656441708cf0188d9d8c852a6353fc45c03e6a22dRoland Scheidegger 2378bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (vector_justify) { 2388bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger#ifdef PIPE_ARCH_BIG_ENDIAN 2398bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res = LLVMBuildShl(gallivm->builder, res, 2408bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMConstInt(dst_elem_type, 2418bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger dst_type.width - src_width, 0), ""); 2428bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger#endif 2438bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 2448bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 2458bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 2468bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger return res; 2478bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger} 2488bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 2498bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 2508bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 2518bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 252437d7e1bafedeea5c69e495b57c215977c727617José Fonsecastatic LLVMValueRef 253437d7e1bafedeea5c69e495b57c215977c727617José Fonsecalp_build_gather_avx2(struct gallivm_state *gallivm, 254437d7e1bafedeea5c69e495b57c215977c727617José Fonseca unsigned length, 255437d7e1bafedeea5c69e495b57c215977c727617José Fonseca unsigned src_width, 2568bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger struct lp_type dst_type, 257437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef base_ptr, 258437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef offsets) 259437d7e1bafedeea5c69e495b57c215977c727617José Fonseca{ 260437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMBuilderRef builder = gallivm->builder; 2618bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMTypeRef src_type, src_vec_type; 262437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef res; 2638bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger struct lp_type res_type = dst_type; 2648bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res_type.length *= length; 265437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 2668bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (dst_type.floating) { 2678bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) : 2688bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMFloatTypeInContext(gallivm->context); 2698bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else { 2708bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger src_type = LLVMIntTypeInContext(gallivm->context, src_width); 2718bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 2728bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger src_vec_type = LLVMVectorType(src_type, length); 2738bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 2748bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */ 275437d7e1bafedeea5c69e495b57c215977c727617José Fonseca assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); 276437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 277437d7e1bafedeea5c69e495b57c215977c727617José Fonseca if (0) { 278437d7e1bafedeea5c69e495b57c215977c727617José Fonseca /* 279437d7e1bafedeea5c69e495b57c215977c727617José Fonseca * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but 2808bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * will not use the AVX2 gather instrinsics (even with llvm 4.0), at 2818bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * least with Haswell. See 282437d7e1bafedeea5c69e495b57c215977c727617José Fonseca * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html 2838bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * And the generated code doing the emulation is quite a bit worse 2848bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * than what we get by doing it ourselves too. 285437d7e1bafedeea5c69e495b57c215977c727617José Fonseca */ 286437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32); 287437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length); 288437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1); 289437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length); 290437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); 291437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef src_ptr; 292437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 293437d7e1bafedeea5c69e495b57c215977c727617José Fonseca base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, ""); 294437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 295437d7e1bafedeea5c69e495b57c215977c727617José Fonseca /* Rescale offsets from bytes to elements */ 296437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0); 297437d7e1bafedeea5c69e495b57c215977c727617José Fonseca scale = lp_build_broadcast(gallivm, i32_vec_type, scale); 298437d7e1bafedeea5c69e495b57c215977c727617José Fonseca assert(LLVMTypeOf(offsets) == i32_vec_type); 299437d7e1bafedeea5c69e495b57c215977c727617José Fonseca offsets = LLVMBuildSDiv(builder, offsets, scale, ""); 300437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 301437d7e1bafedeea5c69e495b57c215977c727617José Fonseca src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep"); 302437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 303437d7e1bafedeea5c69e495b57c215977c727617José Fonseca char intrinsic[64]; 3048bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u", 3058bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger length, dst_type.floating ? "f" : "i", src_width); 306437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0); 307437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type); 308437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef passthru = LLVMGetUndef(src_vec_type); 309437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 310437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef args[] = { src_ptr, alignment, mask, passthru }; 311437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 312437d7e1bafedeea5c69e495b57c215977c727617José Fonseca res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0); 313437d7e1bafedeea5c69e495b57c215977c727617José Fonseca } else { 314437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8); 315437d7e1bafedeea5c69e495b57c215977c727617José Fonseca const char *intrinsic = NULL; 3168bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger unsigned l_idx = 0; 3178bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 3188bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger assert(src_width == 32 || src_width == 64); 3198bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (src_width == 32) { 3208bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger assert(length == 4 || length == 8); 3218bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else { 3228bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger assert(length == 2 || length == 4); 323437d7e1bafedeea5c69e495b57c215977c727617José Fonseca } 324437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 3258bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger static const char *intrinsics[2][2][2] = { 3268bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 3278bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger {{"llvm.x86.avx2.gather.d.d", 3288bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger "llvm.x86.avx2.gather.d.d.256"}, 3298bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger {"llvm.x86.avx2.gather.d.q", 3308bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger "llvm.x86.avx2.gather.d.q.256"}}, 3318bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 3328bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger {{"llvm.x86.avx2.gather.d.ps", 3338bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger "llvm.x86.avx2.gather.d.ps.256"}, 3348bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger {"llvm.x86.avx2.gather.d.pd", 3358bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger "llvm.x86.avx2.gather.d.pd.256"}}, 3368bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger }; 3378bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 3388bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if ((src_width == 32 && length == 8) || 3398bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger (src_width == 64 && length == 4)) { 3408bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger l_idx = 1; 3418bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 3428bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx]; 3438bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 344437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef passthru = LLVMGetUndef(src_vec_type); 345437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef mask = LLVMConstAllOnes(src_vec_type); 346437d7e1bafedeea5c69e495b57c215977c727617José Fonseca mask = LLVMConstBitCast(mask, src_vec_type); 347437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0); 348437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 349437d7e1bafedeea5c69e495b57c215977c727617José Fonseca LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale }; 350437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 351437d7e1bafedeea5c69e495b57c215977c727617José Fonseca res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0); 352437d7e1bafedeea5c69e495b57c215977c727617José Fonseca } 3538bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), ""); 354437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 355437d7e1bafedeea5c69e495b57c215977c727617José Fonseca return res; 356437d7e1bafedeea5c69e495b57c215977c727617José Fonseca} 357437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 358437d7e1bafedeea5c69e495b57c215977c727617José Fonseca 359bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca/** 360bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Gather elements from scatter positions in memory into a single vector. 361bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Use for fetching texels from a texture. 362bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * For SSE, typical values are length=4, src_width=32, dst_width=32. 363bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * 3642151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * When src_width < dst_width, the return value can be justified in 3652151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * one of two ways: 3662151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * "integer justification" is used when the caller treats the destination 3672151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * as a packed integer bitmask, as described by the channels' "shift" and 3682151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * "width" fields; 3692151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * "vector justification" is used when the caller casts the destination 3702151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * to a vector and needs channel X to be in vector element 0. 3712151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * 372bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * @param length length of the offsets 373bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * @param src_width src element width in bits 3748bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * @param dst_type result element type (src will be expanded to fit, 3758bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * but truncation is not allowed) 3768bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * (this may be a vector, must be pot sized) 37774f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * @param aligned whether the data is guaranteed to be aligned (to src_width) 3788bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * @param base_ptr base pointer, needs to be a i8 pointer type. 379bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * @param offsets vector with offsets 3802151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * @param vector_justify select vector rather than integer justification 381bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca */ 382bb1546f55be3b243b71d39e5fb7457c5b21e32c9José FonsecaLLVMValueRef 383efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paullp_build_gather(struct gallivm_state *gallivm, 384bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca unsigned length, 385bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca unsigned src_width, 3868bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger struct lp_type dst_type, 38774f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger boolean aligned, 388bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca LLVMValueRef base_ptr, 3892151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson LLVMValueRef offsets, 3902151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson boolean vector_justify) 391bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca{ 392bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca LLVMValueRef res; 3938bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger boolean need_expansion = src_width < dst_type.width * dst_type.length; 3948bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger boolean vec_fetch; 3958bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger struct lp_type fetch_type, fetch_dst_type; 3968bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMTypeRef src_type; 3978bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 3988bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger assert(src_width <= dst_type.width * dst_type.length); 3998bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger 4008bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* 4018bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * This is quite a mess... 4028bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Figure out if the fetch should be done as: 4038bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * a) scalar or vector 4048bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * b) float or int 4058bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * 4068bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * As an example, for a 96bit fetch expanded into 4x32bit, it is better 4078bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * to use (3x32bit) vector type (then pad the vector). Otherwise, the 4088bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * zext will cause extra instructions. 4098bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * However, the same isn't true for 3x16bit (the codegen for that is 4108bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * completely worthless on x86 simd, and for 3x8bit is is way worse 4118bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * still, don't try that... (To get really good code out of llvm for 4128bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * these cases, the only way is to decompose the fetches manually 4138bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter 4148bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * case requires sse41, otherwise simple scalar zext is way better. 4158bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * But probably not important enough, so don't bother.) 4168bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Also, we try to honor the floating bit of destination (but isn't 4178bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * possible if caller asks for instance for 2x32bit dst_type with 4188bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * 48bit fetch - the idea would be to use 3x16bit fetch, pad and 4198bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * cast to 2x32f type, so the fetch is always int and on top of that 4208bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * we avoid the vec pad and use scalar zext due the above mentioned 4218bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * issue). 4228bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Note this is optimized for x86 sse2 and up backend. Could be tweaked 4238bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * for other archs if necessary... 4248bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger */ 4258bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) && 4268bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger (dst_type.length > 1)) { 4278bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* use vector fetch (if dst_type is vector) */ 4288bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger vec_fetch = TRUE; 4298bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (dst_type.floating) { 4308bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger fetch_type = lp_type_float_vec(dst_type.width, src_width); 4318bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else { 4328bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger fetch_type = lp_type_int_vec(dst_type.width, src_width); 4338bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 4348bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* intentionally not using lp_build_vec_type here */ 4358bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type), 4368bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger fetch_type.length); 4378bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger fetch_dst_type = fetch_type; 4388bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger fetch_dst_type.length = dst_type.length; 4398bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else { 4408bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* use scalar fetch */ 4418bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger vec_fetch = FALSE; 4428bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (dst_type.floating && ((src_width == 32) || (src_width == 64))) { 4438bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger fetch_type = lp_type_float(src_width); 4448bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else { 4458bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger fetch_type = lp_type_int(src_width); 4468bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 4478bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger src_type = lp_build_vec_type(gallivm, fetch_type); 4488bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger fetch_dst_type = fetch_type; 4498bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger fetch_dst_type.width = dst_type.width * dst_type.length; 4508bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 451bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 452bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca if (length == 1) { 453bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca /* Scalar */ 4548bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res = lp_build_gather_elem_vec(gallivm, length, 4558bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger src_width, src_type, fetch_dst_type, 4568bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger aligned, base_ptr, offsets, 0, 4578bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger vector_justify); 4588bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger return LLVMBuildBitCast(gallivm->builder, res, 4598bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger lp_build_vec_type(gallivm, dst_type), ""); 4608bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* 4618bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Excluding expansion from these paths because if you need it for 4628bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * 32bit/64bit fetches you're doing it wrong (this is gather, not 4638bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * conversion) and it would be awkward for floats. 4648bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger */ 4658bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else if (util_cpu_caps.has_avx2 && !need_expansion && 4668bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger src_width == 32 && (length == 4 || length == 8)) { 4678bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger return lp_build_gather_avx2(gallivm, length, src_width, dst_type, 4688bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger base_ptr, offsets); 4698bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* 4708bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * This looks bad on paper wrt throughtput/latency on Haswell. 4718bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Even on Broadwell it doesn't look stellar. 4728bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Albeit no measurements were done (but tested to work). 4738bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Should definitely enable on Skylake. 4748bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * (In general, should be more of a win if the fetch is 256bit wide - 4758bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * this is true for the 32bit case above too.) 4768bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger */ 4778bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else if (0 && util_cpu_caps.has_avx2 && !need_expansion && 4788bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger src_width == 64 && (length == 2 || length == 4)) { 4798bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger return lp_build_gather_avx2(gallivm, length, src_width, dst_type, 4808bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger base_ptr, offsets); 481bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca } else { 482bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca /* Vector */ 483bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 4848bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8]; 485bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca unsigned i; 4868ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger boolean vec_zext = FALSE; 4878bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger struct lp_type res_type, gather_res_type; 4888bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger LLVMTypeRef res_t, gather_res_t; 4898ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger 4908bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res_type = fetch_dst_type; 4918bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res_type.length *= length; 4928bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger gather_res_type = res_type; 4938ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger 4948bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) { 4958ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger /* 4968ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger * Note that llvm is never able to optimize zext/insert combos 4978ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger * directly (i.e. zero the simd reg, then place the elements into 4988bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * the appropriate place directly). (I think this has to do with 4998bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * scalar/vector transition.) And scalar 16->32bit zext simd loads 5008ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger * aren't possible (instead loading to scalar reg first). 5018ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger * No idea about other archs... 5028ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger * We could do this manually, but instead we just use a vector 5038ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger * zext, which is simple enough (and, in fact, llvm might optimize 5048ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger * this away). 5058ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger * (We're not trying that with other bit widths as that might not be 5068ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger * easier, in particular with 8 bit values at least with only sse2.) 5078ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger */ 5088bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger assert(vec_fetch == FALSE); 5098bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger gather_res_type.width /= 2; 5108bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger fetch_dst_type = fetch_type; 5118bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger src_type = lp_build_vec_type(gallivm, fetch_type); 5128ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger vec_zext = TRUE; 5138ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger } 5148bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res_t = lp_build_vec_type(gallivm, res_type); 5158bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger gather_res_t = lp_build_vec_type(gallivm, gather_res_type); 5168bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res = LLVMGetUndef(gather_res_t); 517bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca for (i = 0; i < length; ++i) { 518efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul LLVMValueRef index = lp_build_const_int32(gallivm, i); 5198bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger elems[i] = lp_build_gather_elem_vec(gallivm, length, 5208bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger src_width, src_type, fetch_dst_type, 5218bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger aligned, base_ptr, offsets, i, 5228bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger vector_justify); 5238bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (!vec_fetch) { 5248bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, ""); 5258bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 526bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca } 5278ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger if (vec_zext) { 5288bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res = LLVMBuildZExt(gallivm->builder, res, res_t, ""); 5298ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger if (vector_justify) { 530ff81869f0dcd2210e5f09c2e0e0c116f46952734Dave Airlie#ifdef PIPE_ARCH_BIG_ENDIAN 5318bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger unsigned sv = dst_type.width - src_width; 5328ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger res = LLVMBuildShl(gallivm->builder, res, 5338bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger lp_build_const_int_vec(gallivm, res_type, sv), ""); 5348ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger#endif 5358ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger } 5368ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger } 5378bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger if (vec_fetch) { 5388bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger /* 5398bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Do bitcast now otherwise llvm might get some funny ideas wrt 5408bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * float/int types... 5418bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger */ 5428bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger for (i = 0; i < length; i++) { 5438bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i], 5448bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger lp_build_vec_type(gallivm, dst_type), ""); 5458bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 5468bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res = lp_build_concat(gallivm, elems, dst_type, length); 5478bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } else { 5488bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger struct lp_type really_final_type = dst_type; 5498bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger assert(res_type.length * res_type.width == 5508bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger dst_type.length * dst_type.width * length); 5518bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger really_final_type.length *= length; 5528bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger res = LLVMBuildBitCast(gallivm->builder, res, 5538bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger lp_build_vec_type(gallivm, really_final_type), ""); 5548bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger } 555bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca } 556bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca 557bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca return res; 558bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca} 55952049744620854487012151a7ac26ca978905411Tom Stellard 56052049744620854487012151a7ac26ca978905411Tom StellardLLVMValueRef 56152049744620854487012151a7ac26ca978905411Tom Stellardlp_build_gather_values(struct gallivm_state * gallivm, 56252049744620854487012151a7ac26ca978905411Tom Stellard LLVMValueRef * values, 56352049744620854487012151a7ac26ca978905411Tom Stellard unsigned value_count) 56452049744620854487012151a7ac26ca978905411Tom Stellard{ 56552049744620854487012151a7ac26ca978905411Tom Stellard LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count); 56652049744620854487012151a7ac26ca978905411Tom Stellard LLVMBuilderRef builder = gallivm->builder; 56752049744620854487012151a7ac26ca978905411Tom Stellard LLVMValueRef vec = LLVMGetUndef(vec_type); 56852049744620854487012151a7ac26ca978905411Tom Stellard unsigned i; 56952049744620854487012151a7ac26ca978905411Tom Stellard 57052049744620854487012151a7ac26ca978905411Tom Stellard for (i = 0; i < value_count; i++) { 57152049744620854487012151a7ac26ca978905411Tom Stellard LLVMValueRef index = lp_build_const_int32(gallivm, i); 57252049744620854487012151a7ac26ca978905411Tom Stellard vec = LLVMBuildInsertElement(builder, vec, values[i], index, ""); 57352049744620854487012151a7ac26ca978905411Tom Stellard } 57452049744620854487012151a7ac26ca978905411Tom Stellard return vec; 57552049744620854487012151a7ac26ca978905411Tom Stellard} 576