1bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca/**************************************************************************
2bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca *
3bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Copyright 2010 VMware, Inc.
4bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * All Rights Reserved.
5bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca *
6bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Permission is hereby granted, free of charge, to any person obtaining a
7bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * copy of this software and associated documentation files (the
8bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * "Software"), to deal in the Software without restriction, including
9bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * without limitation the rights to use, copy, modify, merge, publish,
10bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * distribute, sub license, and/or sell copies of the Software, and to
11bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * permit persons to whom the Software is furnished to do so, subject to
12bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * the following conditions:
13bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca *
14bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * USE OR OTHER DEALINGS IN THE SOFTWARE.
21bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca *
22bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * The above copyright notice and this permission notice (including the
23bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * next paragraph) shall be included in all copies or substantial portions
24bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * of the Software.
25bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca *
26bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca **************************************************************************/
27bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
28bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
29bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca#include "util/u_debug.h"
30437d7e1bafedeea5c69e495b57c215977c727617José Fonseca#include "util/u_cpu_detect.h"
318bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger#include "util/u_math.h"
32bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca#include "lp_bld_debug.h"
33bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca#include "lp_bld_const.h"
34bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca#include "lp_bld_format.h"
35bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca#include "lp_bld_gather.h"
36437d7e1bafedeea5c69e495b57c215977c727617José Fonseca#include "lp_bld_swizzle.h"
378ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger#include "lp_bld_type.h"
38efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul#include "lp_bld_init.h"
39e25abacc1883f1b2e09c32230e35ffae7df5e61bAdhemerval Zanella#include "lp_bld_intr.h"
408bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger#include "lp_bld_pack.h"
41bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
42bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
43bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca/**
44bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Get the pointer to one element from scatter positions in memory.
45bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca *
46bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * @sa lp_build_gather()
47bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca */
48bb1546f55be3b243b71d39e5fb7457c5b21e32c9José FonsecaLLVMValueRef
49efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paullp_build_gather_elem_ptr(struct gallivm_state *gallivm,
50bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                         unsigned length,
51bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                         LLVMValueRef base_ptr,
52bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                         LLVMValueRef offsets,
53bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                         unsigned i)
54bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca{
55bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   LLVMValueRef offset;
56bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   LLVMValueRef ptr;
57bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
58efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
59bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
60bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   if (length == 1) {
61bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca      assert(i == 0);
62bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca      offset = offsets;
63bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   } else {
64efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul      LLVMValueRef index = lp_build_const_int32(gallivm, i);
65efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul      offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, "");
66bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   }
67bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
68efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul   ptr = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
69bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
70bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   return ptr;
71bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca}
72bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
73bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
74bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca/**
75bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Gather one element from scatter positions in memory.
76bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca *
77bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * @sa lp_build_gather()
78bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca */
79bb1546f55be3b243b71d39e5fb7457c5b21e32c9José FonsecaLLVMValueRef
80efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paullp_build_gather_elem(struct gallivm_state *gallivm,
81bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                     unsigned length,
82bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                     unsigned src_width,
83bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                     unsigned dst_width,
8474f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger                     boolean aligned,
85bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                     LLVMValueRef base_ptr,
86bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                     LLVMValueRef offsets,
872151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson                     unsigned i,
882151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson                     boolean vector_justify)
89bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca{
90efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul   LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
91bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
92efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul   LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
93bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   LLVMValueRef ptr;
94bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   LLVMValueRef res;
95bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
96efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
97bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
98efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul   ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
99efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul   ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
100efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul   res = LLVMBuildLoad(gallivm->builder, ptr, "");
101bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
10274f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger   /* XXX
10374f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * On some archs we probably really want to avoid having to deal
10474f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * with alignments lower than 4 bytes (if fetch size is a power of
10574f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * two >= 32). On x86 it doesn't matter, however.
10674f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * We should be able to guarantee full alignment for any kind of texture
10774f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
10874f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
10974f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * but I don't think that's quite what we wanted).
11074f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
11174f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
11274f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * enforcing what we want (which is what d3d10 does, the offset needs to
11374f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * be aligned to element size, but GL has bytes regardless of element
11474f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * size which would only leave us with minimum alignment restriction of 16
11574f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * which doesn't make much sense if the type isn't 4x32bit). Due to
11674f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * translation of offsets to first_elem in sampler_views it actually seems
11774f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    * gallium could not do anything else except 16 no matter what...
11874f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger    */
1198bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   if (!aligned) {
120bcfb86b09de3bfc9c7cdf6925658b5e529a8fc62José Fonseca      LLVMSetAlignment(res, 1);
1218bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   } else if (!util_is_power_of_two(src_width)) {
1228bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      /*
1238bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * Full alignment is impossible, assume the caller really meant
1248bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * the individual elements were aligned (e.g. 3x32bit format).
1258bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * And yes the generated code may otherwise crash, llvm will
1268bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * really assume 128bit alignment with a 96bit fetch (I suppose
1278bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * that makes sense as it can just assume the upper 32bit to be
1288bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * whatever).
1298bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * Maybe the caller should be able to explicitly set this, but
1308bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * this should cover all the 3-channel formats.
1318bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       */
1328bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      if (((src_width / 24) * 24 == src_width) &&
1338bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger           util_is_power_of_two(src_width / 24)) {
1348bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          LLVMSetAlignment(res, src_width / 24);
1358bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      } else {
1368bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         LLVMSetAlignment(res, 1);
1378bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      }
13874f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger   }
13974f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger
140bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   assert(src_width <= dst_width);
1418bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   if (src_width < dst_width) {
142efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul      res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
1432151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson      if (vector_justify) {
144e25abacc1883f1b2e09c32230e35ffae7df5e61bAdhemerval Zanella#ifdef PIPE_ARCH_BIG_ENDIAN
1452151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson         res = LLVMBuildShl(gallivm->builder, res,
1462151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson                            LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");
147e25abacc1883f1b2e09c32230e35ffae7df5e61bAdhemerval Zanella#endif
1482151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson      }
149e25abacc1883f1b2e09c32230e35ffae7df5e61bAdhemerval Zanella   }
150bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
151bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   return res;
152bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca}
153bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
154bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
1558bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger/**
1568bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Gather one element from scatter positions in memory.
1578bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Nearly the same as above, however the individual elements
1588bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * may be vectors themselves, and fetches may be float type.
1598bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * Can also do pad vector instead of ZExt.
1608bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger *
1618bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * @sa lp_build_gather()
1628bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger */
1638bd67a35c50e68c21aed043de11e095c284d151aRoland Scheideggerstatic LLVMValueRef
1648bd67a35c50e68c21aed043de11e095c284d151aRoland Scheideggerlp_build_gather_elem_vec(struct gallivm_state *gallivm,
1658bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                         unsigned length,
1668bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                         unsigned src_width,
1678bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                         LLVMTypeRef src_type,
1688bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                         struct lp_type dst_type,
1698bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                         boolean aligned,
1708bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                         LLVMValueRef base_ptr,
1718bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                         LLVMValueRef offsets,
1728bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                         unsigned i,
1738bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                         boolean vector_justify)
1748bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger{
1758bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   LLVMValueRef ptr, res;
1768bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
1778bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
1788bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
1798bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
1808bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
1818bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   res = LLVMBuildLoad(gallivm->builder, ptr, "");
1828bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
1838bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   /* XXX
1848bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * On some archs we probably really want to avoid having to deal
1858bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * with alignments lower than 4 bytes (if fetch size is a power of
1868bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * two >= 32). On x86 it doesn't matter, however.
1878bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * We should be able to guarantee full alignment for any kind of texture
1888bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
1898bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
1908bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * but I don't think that's quite what we wanted).
1918bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
1928bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
1938bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * enforcing what we want (which is what d3d10 does, the offset needs to
1948bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * be aligned to element size, but GL has bytes regardless of element
1958bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * size which would only leave us with minimum alignment restriction of 16
1968bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * which doesn't make much sense if the type isn't 4x32bit). Due to
1978bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * translation of offsets to first_elem in sampler_views it actually seems
1988bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * gallium could not do anything else except 16 no matter what...
1998bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    */
2008bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   if (!aligned) {
2018bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      LLVMSetAlignment(res, 1);
2028bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   } else if (!util_is_power_of_two(src_width)) {
2038bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      /*
2048bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * Full alignment is impossible, assume the caller really meant
2058bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * the individual elements were aligned (e.g. 3x32bit format).
2068bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * And yes the generated code may otherwise crash, llvm will
2078bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * really assume 128bit alignment with a 96bit fetch (I suppose
2088bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * that makes sense as it can just assume the upper 32bit to be
2098bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * whatever).
2108bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * Maybe the caller should be able to explicitly set this, but
2118bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * this should cover all the 3-channel formats.
2128bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       */
2138bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      if (((src_width / 24) * 24 == src_width) &&
2148bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger           util_is_power_of_two(src_width / 24)) {
2158bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          LLVMSetAlignment(res, src_width / 24);
2168bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      } else {
2178bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         LLVMSetAlignment(res, 1);
2188bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      }
2198bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   }
2208bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
2218bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   assert(src_width <= dst_type.width * dst_type.length);
2228bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   if (src_width < dst_type.width * dst_type.length) {
2238bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      if (dst_type.length > 1) {
2248bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         res = lp_build_pad_vector(gallivm, res, dst_type.length);
2258bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         /*
2268bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          * vector_justify hopefully a non-issue since we only deal
2278bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          * with src_width >= 32 here?
2288bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          */
2298bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      } else {
23056441708cf0188d9d8c852a6353fc45c03e6a22dRoland Scheidegger         LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type);
23156441708cf0188d9d8c852a6353fc45c03e6a22dRoland Scheidegger
2328bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         /*
2338bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          * Only valid if src_ptr_type is int type...
2348bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          */
23556441708cf0188d9d8c852a6353fc45c03e6a22dRoland Scheidegger         res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
23656441708cf0188d9d8c852a6353fc45c03e6a22dRoland Scheidegger
2378bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         if (vector_justify) {
2388bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger#ifdef PIPE_ARCH_BIG_ENDIAN
2398bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         res = LLVMBuildShl(gallivm->builder, res,
2408bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                            LLVMConstInt(dst_elem_type,
2418bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                         dst_type.width - src_width, 0), "");
2428bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger#endif
2438bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         }
2448bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      }
2458bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   }
2468bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   return res;
2478bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger}
2488bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
2498bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
2508bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
2518bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
252437d7e1bafedeea5c69e495b57c215977c727617José Fonsecastatic LLVMValueRef
253437d7e1bafedeea5c69e495b57c215977c727617José Fonsecalp_build_gather_avx2(struct gallivm_state *gallivm,
254437d7e1bafedeea5c69e495b57c215977c727617José Fonseca                     unsigned length,
255437d7e1bafedeea5c69e495b57c215977c727617José Fonseca                     unsigned src_width,
2568bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                     struct lp_type dst_type,
257437d7e1bafedeea5c69e495b57c215977c727617José Fonseca                     LLVMValueRef base_ptr,
258437d7e1bafedeea5c69e495b57c215977c727617José Fonseca                     LLVMValueRef offsets)
259437d7e1bafedeea5c69e495b57c215977c727617José Fonseca{
260437d7e1bafedeea5c69e495b57c215977c727617José Fonseca   LLVMBuilderRef builder = gallivm->builder;
2618bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   LLVMTypeRef src_type, src_vec_type;
262437d7e1bafedeea5c69e495b57c215977c727617José Fonseca   LLVMValueRef res;
2638bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   struct lp_type res_type = dst_type;
2648bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   res_type.length *= length;
265437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
2668bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   if (dst_type.floating) {
2678bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
2688bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                   LLVMFloatTypeInContext(gallivm->context);
2698bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   } else {
2708bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      src_type = LLVMIntTypeInContext(gallivm->context, src_width);
2718bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   }
2728bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   src_vec_type = LLVMVectorType(src_type, length);
2738bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
2748bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
275437d7e1bafedeea5c69e495b57c215977c727617José Fonseca   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
276437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
277437d7e1bafedeea5c69e495b57c215977c727617José Fonseca   if (0) {
278437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      /*
279437d7e1bafedeea5c69e495b57c215977c727617José Fonseca       * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
2808bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * will not use the AVX2 gather instrinsics (even with llvm 4.0), at
2818bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * least with Haswell. See
282437d7e1bafedeea5c69e495b57c215977c727617José Fonseca       * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
2838bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * And the generated code doing the emulation is quite a bit worse
2848bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * than what we get by doing it ourselves too.
285437d7e1bafedeea5c69e495b57c215977c727617José Fonseca       */
286437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
287437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
288437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1);
289437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length);
290437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
291437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMValueRef src_ptr;
292437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
293437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, "");
294437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
295437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      /* Rescale offsets from bytes to elements */
296437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0);
297437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      scale = lp_build_broadcast(gallivm, i32_vec_type, scale);
298437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      assert(LLVMTypeOf(offsets) == i32_vec_type);
299437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      offsets = LLVMBuildSDiv(builder, offsets, scale, "");
300437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
301437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");
302437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
303437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      char intrinsic[64];
3048bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
3058bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                    length, dst_type.floating ? "f" : "i", src_width);
306437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
307437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
308437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
309437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
310437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMValueRef args[] = { src_ptr, alignment, mask, passthru };
311437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
312437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
313437d7e1bafedeea5c69e495b57c215977c727617José Fonseca   } else {
314437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
315437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      const char *intrinsic = NULL;
3168bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      unsigned l_idx = 0;
3178bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
3188bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      assert(src_width == 32 || src_width == 64);
3198bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      if (src_width == 32) {
3208bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         assert(length == 4 || length == 8);
3218bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      } else {
3228bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         assert(length == 2 || length == 4);
323437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      }
324437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
3258bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      static const char *intrinsics[2][2][2] = {
3268bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
3278bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         {{"llvm.x86.avx2.gather.d.d",
3288bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger           "llvm.x86.avx2.gather.d.d.256"},
3298bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          {"llvm.x86.avx2.gather.d.q",
3308bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger           "llvm.x86.avx2.gather.d.q.256"}},
3318bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
3328bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         {{"llvm.x86.avx2.gather.d.ps",
3338bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger           "llvm.x86.avx2.gather.d.ps.256"},
3348bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          {"llvm.x86.avx2.gather.d.pd",
3358bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger           "llvm.x86.avx2.gather.d.pd.256"}},
3368bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      };
3378bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
3388bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      if ((src_width == 32 && length == 8) ||
3398bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          (src_width == 64 && length == 4)) {
3408bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         l_idx = 1;
3418bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      }
3428bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
3438bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
344437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
345437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
346437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      mask = LLVMConstBitCast(mask, src_vec_type);
347437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0);
348437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
349437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale };
350437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
351437d7e1bafedeea5c69e495b57c215977c727617José Fonseca      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
352437d7e1bafedeea5c69e495b57c215977c727617José Fonseca   }
3538bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");
354437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
355437d7e1bafedeea5c69e495b57c215977c727617José Fonseca   return res;
356437d7e1bafedeea5c69e495b57c215977c727617José Fonseca}
357437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
358437d7e1bafedeea5c69e495b57c215977c727617José Fonseca
359bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca/**
360bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Gather elements from scatter positions in memory into a single vector.
361bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * Use for fetching texels from a texture.
362bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * For SSE, typical values are length=4, src_width=32, dst_width=32.
363bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca *
3642151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * When src_width < dst_width, the return value can be justified in
3652151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * one of two ways:
3662151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * "integer justification" is used when the caller treats the destination
3672151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * as a packed integer bitmask, as described by the channels' "shift" and
3682151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * "width" fields;
3692151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * "vector justification" is used when the caller casts the destination
3702151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * to a vector and needs channel X to be in vector element 0.
3712151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson *
372bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * @param length length of the offsets
373bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * @param src_width src element width in bits
3748bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * @param dst_type result element type (src will be expanded to fit,
3758bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger *        but truncation is not allowed)
3768bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger *        (this may be a vector, must be pot sized)
37774f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger * @param aligned whether the data is guaranteed to be aligned (to src_width)
3788bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger * @param base_ptr base pointer, needs to be a i8 pointer type.
379bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca * @param offsets vector with offsets
3802151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson * @param vector_justify select vector rather than integer justification
381bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca */
382bb1546f55be3b243b71d39e5fb7457c5b21e32c9José FonsecaLLVMValueRef
383efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paullp_build_gather(struct gallivm_state *gallivm,
384bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                unsigned length,
385bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                unsigned src_width,
3868bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                struct lp_type dst_type,
38774f505fa73eda0c9b5b1984bebb44cedac8e8794Roland Scheidegger                boolean aligned,
388bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca                LLVMValueRef base_ptr,
3892151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson                LLVMValueRef offsets,
3902151d893fbd4a4be092098170e2fbca8c35797a5Adam Jackson                boolean vector_justify)
391bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca{
392bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   LLVMValueRef res;
3938bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   boolean need_expansion = src_width < dst_type.width * dst_type.length;
3948bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   boolean vec_fetch;
3958bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   struct lp_type fetch_type, fetch_dst_type;
3968bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   LLVMTypeRef src_type;
3978bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
3988bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   assert(src_width <= dst_type.width * dst_type.length);
3998bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger
4008bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   /*
4018bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * This is quite a mess...
4028bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * Figure out if the fetch should be done as:
4038bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * a) scalar or vector
4048bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * b) float or int
4058bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    *
4068bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * As an example, for a 96bit fetch expanded into 4x32bit, it is better
4078bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * to use (3x32bit) vector type (then pad the vector). Otherwise, the
4088bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * zext will cause extra instructions.
4098bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * However, the same isn't true for 3x16bit (the codegen for that is
4108bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * completely worthless on x86 simd, and for 3x8bit is is way worse
4118bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * still, don't try that... (To get really good code out of llvm for
4128bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * these cases, the only way is to decompose the fetches manually
4138bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
4148bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * case requires sse41, otherwise simple scalar zext is way better.
4158bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * But probably not important enough, so don't bother.)
4168bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * Also, we try to honor the floating bit of destination (but isn't
4178bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * possible if caller asks for instance for 2x32bit dst_type with
4188bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * 48bit fetch - the idea would be to use 3x16bit fetch, pad and
4198bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * cast to 2x32f type, so the fetch is always int and on top of that
4208bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * we avoid the vec pad and use scalar zext due the above mentioned
4218bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * issue).
4228bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * Note this is optimized for x86 sse2 and up backend. Could be tweaked
4238bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * for other archs if necessary...
4248bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    */
4258bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
4268bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       (dst_type.length > 1)) {
4278bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      /* use vector fetch (if dst_type is vector) */
4288bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      vec_fetch = TRUE;
4298bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      if (dst_type.floating) {
4308bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         fetch_type = lp_type_float_vec(dst_type.width, src_width);
4318bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      } else {
4328bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         fetch_type = lp_type_int_vec(dst_type.width, src_width);
4338bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      }
4348bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      /* intentionally not using lp_build_vec_type here */
4358bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
4368bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                fetch_type.length);
4378bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      fetch_dst_type = fetch_type;
4388bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      fetch_dst_type.length = dst_type.length;
4398bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    } else {
4408bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      /* use scalar fetch */
4418bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      vec_fetch = FALSE;
4428bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
4438bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         fetch_type = lp_type_float(src_width);
4448bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      } else {
4458bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         fetch_type = lp_type_int(src_width);
4468bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      }
4478bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      src_type = lp_build_vec_type(gallivm, fetch_type);
4488bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      fetch_dst_type = fetch_type;
4498bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      fetch_dst_type.width = dst_type.width * dst_type.length;
4508bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   }
451bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
452bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   if (length == 1) {
453bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca      /* Scalar */
4548bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      res = lp_build_gather_elem_vec(gallivm, length,
4558bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                     src_width, src_type, fetch_dst_type,
4568bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                     aligned, base_ptr, offsets, 0,
4578bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                     vector_justify);
4588bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      return LLVMBuildBitCast(gallivm->builder, res,
4598bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                              lp_build_vec_type(gallivm, dst_type), "");
4608bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      /*
4618bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * Excluding expansion from these paths because if you need it for
4628bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * 32bit/64bit fetches you're doing it wrong (this is gather, not
4638bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       * conversion) and it would be awkward for floats.
4648bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger       */
4658bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   } else if (util_cpu_caps.has_avx2 && !need_expansion &&
4668bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger              src_width == 32 && (length == 4 || length == 8)) {
4678bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
4688bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                  base_ptr, offsets);
4698bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   /*
4708bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * This looks bad on paper wrt throughtput/latency on Haswell.
4718bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * Even on Broadwell it doesn't look stellar.
4728bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * Albeit no measurements were done (but tested to work).
4738bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * Should definitely enable on Skylake.
4748bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * (In general, should be more of a win if the fetch is 256bit wide -
4758bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    * this is true for the 32bit case above too.)
4768bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger    */
4778bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger   } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
4788bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger              src_width == 64 && (length == 2 || length == 4)) {
4798bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
4808bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                  base_ptr, offsets);
481bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   } else {
482bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca      /* Vector */
483bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
4848bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
485bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca      unsigned i;
4868ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger      boolean vec_zext = FALSE;
4878bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      struct lp_type res_type, gather_res_type;
4888bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      LLVMTypeRef res_t, gather_res_t;
4898ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger
4908bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      res_type = fetch_dst_type;
4918bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      res_type.length *= length;
4928bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      gather_res_type = res_type;
4938ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger
4948bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
4958ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger         /*
4968ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger          * Note that llvm is never able to optimize zext/insert combos
4978ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger          * directly (i.e. zero the simd reg, then place the elements into
4988bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          * the appropriate place directly). (I think this has to do with
4998bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          * scalar/vector transition.) And scalar 16->32bit zext simd loads
5008ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger          * aren't possible (instead loading to scalar reg first).
5018ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger          * No idea about other archs...
5028ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger          * We could do this manually, but instead we just use a vector
5038ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger          * zext, which is simple enough (and, in fact, llvm might optimize
5048ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger          * this away).
5058ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger          * (We're not trying that with other bit widths as that might not be
5068ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger          * easier, in particular with 8 bit values at least with only sse2.)
5078ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger          */
5088bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         assert(vec_fetch == FALSE);
5098bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         gather_res_type.width /= 2;
5108bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         fetch_dst_type = fetch_type;
5118bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         src_type = lp_build_vec_type(gallivm, fetch_type);
5128ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger         vec_zext = TRUE;
5138ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger      }
5148bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      res_t = lp_build_vec_type(gallivm, res_type);
5158bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
5168bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      res = LLVMGetUndef(gather_res_t);
517bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca      for (i = 0; i < length; ++i) {
518efc82aef35a2aac5d2ed9774f6d28f2626796416Brian Paul         LLVMValueRef index = lp_build_const_int32(gallivm, i);
5198bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         elems[i] = lp_build_gather_elem_vec(gallivm, length,
5208bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                             src_width, src_type, fetch_dst_type,
5218bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                             aligned, base_ptr, offsets, i,
5228bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                             vector_justify);
5238bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         if (!vec_fetch) {
5248bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger            res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
5258bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         }
526bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca      }
5278ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger      if (vec_zext) {
5288bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
5298ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger         if (vector_justify) {
530ff81869f0dcd2210e5f09c2e0e0c116f46952734Dave Airlie#ifdef PIPE_ARCH_BIG_ENDIAN
5318bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger            unsigned sv = dst_type.width - src_width;
5328ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger            res = LLVMBuildShl(gallivm->builder, res,
5338bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                               lp_build_const_int_vec(gallivm, res_type, sv), "");
5348ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger#endif
5358ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger         }
5368ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3Roland Scheidegger      }
5378bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      if (vec_fetch) {
5388bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         /*
5398bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          * Do bitcast now otherwise llvm might get some funny ideas wrt
5408bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          * float/int types...
5418bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger          */
5428bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         for (i = 0; i < length; i++) {
5438bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger            elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
5448bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                        lp_build_vec_type(gallivm, dst_type), "");
5458bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         }
5468bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         res = lp_build_concat(gallivm, elems, dst_type, length);
5478bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      } else {
5488bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         struct lp_type really_final_type = dst_type;
5498bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         assert(res_type.length * res_type.width ==
5508bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                dst_type.length * dst_type.width * length);
5518bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         really_final_type.length *= length;
5528bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger         res = LLVMBuildBitCast(gallivm->builder, res,
5538bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger                                lp_build_vec_type(gallivm, really_final_type), "");
5548bd67a35c50e68c21aed043de11e095c284d151aRoland Scheidegger      }
555bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   }
556bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca
557bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca   return res;
558bb1546f55be3b243b71d39e5fb7457c5b21e32c9José Fonseca}
55952049744620854487012151a7ac26ca978905411Tom Stellard
56052049744620854487012151a7ac26ca978905411Tom StellardLLVMValueRef
56152049744620854487012151a7ac26ca978905411Tom Stellardlp_build_gather_values(struct gallivm_state * gallivm,
56252049744620854487012151a7ac26ca978905411Tom Stellard                       LLVMValueRef * values,
56352049744620854487012151a7ac26ca978905411Tom Stellard                       unsigned value_count)
56452049744620854487012151a7ac26ca978905411Tom Stellard{
56552049744620854487012151a7ac26ca978905411Tom Stellard   LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count);
56652049744620854487012151a7ac26ca978905411Tom Stellard   LLVMBuilderRef builder = gallivm->builder;
56752049744620854487012151a7ac26ca978905411Tom Stellard   LLVMValueRef vec = LLVMGetUndef(vec_type);
56852049744620854487012151a7ac26ca978905411Tom Stellard   unsigned i;
56952049744620854487012151a7ac26ca978905411Tom Stellard
57052049744620854487012151a7ac26ca978905411Tom Stellard   for (i = 0; i < value_count; i++) {
57152049744620854487012151a7ac26ca978905411Tom Stellard      LLVMValueRef index = lp_build_const_int32(gallivm, i);
57252049744620854487012151a7ac26ca978905411Tom Stellard      vec = LLVMBuildInsertElement(builder, vec, values[i], index, "");
57352049744620854487012151a7ac26ca978905411Tom Stellard   }
57452049744620854487012151a7ac26ca978905411Tom Stellard   return vec;
57552049744620854487012151a7ac26ca978905411Tom Stellard}
576