lp_bld_sample_aos.c revision 3469715a8a171512cf9b528702e70393f01c6041
1/**************************************************************************
2 *
3 * Copyright 2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * @file
30 * Texture sampling -- AoS.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 * @author Brian Paul <brianp@vmware.com>
34 */
35
36#include "pipe/p_defines.h"
37#include "pipe/p_state.h"
38#include "util/u_debug.h"
39#include "util/u_dump.h"
40#include "util/u_memory.h"
41#include "util/u_math.h"
42#include "util/u_format.h"
43#include "util/u_cpu_detect.h"
44#include "lp_bld_debug.h"
45#include "lp_bld_type.h"
46#include "lp_bld_const.h"
47#include "lp_bld_conv.h"
48#include "lp_bld_arit.h"
49#include "lp_bld_bitarit.h"
50#include "lp_bld_logic.h"
51#include "lp_bld_swizzle.h"
52#include "lp_bld_pack.h"
53#include "lp_bld_flow.h"
54#include "lp_bld_gather.h"
55#include "lp_bld_format.h"
56#include "lp_bld_init.h"
57#include "lp_bld_sample.h"
58#include "lp_bld_sample_aos.h"
59#include "lp_bld_quad.h"
60
61
62/**
63 * Build LLVM code for texture coord wrapping, for nearest filtering,
64 * for scaled integer texcoords.
65 * \param block_length  is the length of the pixel block along the
66 *                      coordinate axis
67 * \param coord  the incoming texcoord (s,t,r or q) scaled to the texture size
68 * \param length  the texture size along one dimension
69 * \param stride  pixel stride along the coordinate axis (in bytes)
70 * \param is_pot  if TRUE, length is a power of two
71 * \param wrap_mode  one of PIPE_TEX_WRAP_x
72 * \param out_offset  byte offset for the wrapped coordinate
73 * \param out_i  resulting sub-block pixel coordinate for coord0
74 */
75static void
76lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
77                                 unsigned block_length,
78                                 LLVMValueRef coord,
79                                 LLVMValueRef coord_f,
80                                 LLVMValueRef length,
81                                 LLVMValueRef stride,
82                                 boolean is_pot,
83                                 unsigned wrap_mode,
84                                 LLVMValueRef *out_offset,
85                                 LLVMValueRef *out_i)
86{
87   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
88   LLVMBuilderRef builder = bld->gallivm->builder;
89   LLVMValueRef length_minus_one;
90
91   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
92
93   switch(wrap_mode) {
94   case PIPE_TEX_WRAP_REPEAT:
95      if(is_pot)
96         coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
97      else {
98         struct lp_build_context *coord_bld = &bld->coord_bld;
99         LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
100         coord = lp_build_fract_safe(coord_bld, coord_f);
101         coord = lp_build_mul(coord_bld, coord, length_f);
102         coord = lp_build_itrunc(coord_bld, coord);
103      }
104      break;
105
106   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
107      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
108      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
109      break;
110
111   case PIPE_TEX_WRAP_CLAMP:
112   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
113   case PIPE_TEX_WRAP_MIRROR_REPEAT:
114   case PIPE_TEX_WRAP_MIRROR_CLAMP:
115   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
116   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
117   default:
118      assert(0);
119   }
120
121   lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride,
122                                  out_offset, out_i);
123}
124
125
126/**
127 * Build LLVM code for texture coord wrapping, for nearest filtering,
128 * for float texcoords.
129 * \param coord  the incoming texcoord (s,t,r or q)
130 * \param length  the texture size along one dimension
131 * \param is_pot  if TRUE, length is a power of two
132 * \param wrap_mode  one of PIPE_TEX_WRAP_x
133 * \param icoord  the texcoord after wrapping, as int
134 */
135static void
136lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
137                                   LLVMValueRef coord,
138                                   LLVMValueRef length,
139                                   boolean is_pot,
140                                   unsigned wrap_mode,
141                                   LLVMValueRef *icoord)
142{
143   struct lp_build_context *coord_bld = &bld->coord_bld;
144   LLVMValueRef length_minus_one;
145
146   switch(wrap_mode) {
147   case PIPE_TEX_WRAP_REPEAT:
148      /* take fraction, unnormalize */
149      coord = lp_build_fract_safe(coord_bld, coord);
150      coord = lp_build_mul(coord_bld, coord, length);
151      *icoord = lp_build_itrunc(coord_bld, coord);
152      break;
153   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
154      length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
155      if (bld->static_state->normalized_coords) {
156         /* scale coord to length */
157         coord = lp_build_mul(coord_bld, coord, length);
158      }
159      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero,
160                             length_minus_one);
161      *icoord = lp_build_itrunc(coord_bld, coord);
162      break;
163
164   case PIPE_TEX_WRAP_CLAMP:
165   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
166   case PIPE_TEX_WRAP_MIRROR_REPEAT:
167   case PIPE_TEX_WRAP_MIRROR_CLAMP:
168   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
169   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
170   default:
171      assert(0);
172   }
173}
174
175
176/**
177 * Build LLVM code for texture coord wrapping, for linear filtering,
178 * for scaled integer texcoords.
179 * \param block_length  is the length of the pixel block along the
180 *                      coordinate axis
181 * \param coord0  the incoming texcoord (s,t,r or q) scaled to the texture size
182 * \param length  the texture size along one dimension
183 * \param stride  pixel stride along the coordinate axis (in bytes)
184 * \param is_pot  if TRUE, length is a power of two
185 * \param wrap_mode  one of PIPE_TEX_WRAP_x
186 * \param offset0  resulting relative offset for coord0
187 * \param offset1  resulting relative offset for coord0 + 1
188 * \param i0  resulting sub-block pixel coordinate for coord0
189 * \param i1  resulting sub-block pixel coordinate for coord0 + 1
190 */
191static void
192lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
193                                unsigned block_length,
194                                LLVMValueRef coord0,
195                                LLVMValueRef *weight_i,
196                                LLVMValueRef coord_f,
197                                LLVMValueRef length,
198                                LLVMValueRef stride,
199                                boolean is_pot,
200                                unsigned wrap_mode,
201                                LLVMValueRef *offset0,
202                                LLVMValueRef *offset1,
203                                LLVMValueRef *i0,
204                                LLVMValueRef *i1)
205{
206   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
207   LLVMBuilderRef builder = bld->gallivm->builder;
208   LLVMValueRef length_minus_one;
209   LLVMValueRef lmask, umask, mask;
210
211   /*
212    * If the pixel block covers more than one pixel then there is no easy
213    * way to calculate offset1 relative to offset0. Instead, compute them
214    * independently. Otherwise, try to compute offset0 and offset1 with
215    * a single stride multiplication.
216    */
217
218   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
219
220   if (block_length != 1) {
221      LLVMValueRef coord1;
222      switch(wrap_mode) {
223      case PIPE_TEX_WRAP_REPEAT:
224         if (is_pot) {
225            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
226            coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
227            coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
228         }
229         else {
230            LLVMValueRef mask;
231            LLVMValueRef weight;
232            LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
233            lp_build_coord_repeat_npot_linear(bld, coord_f,
234                                              length, length_f,
235                                              &coord0, &weight);
236            mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
237                                    PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
238            coord1 = LLVMBuildAnd(builder,
239                                  lp_build_add(int_coord_bld, coord0,
240                                               int_coord_bld->one),
241                                  mask, "");
242            weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
243            *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
244         }
245         break;
246
247      case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
248         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
249         coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
250                                length_minus_one);
251         coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
252                                length_minus_one);
253         break;
254
255      case PIPE_TEX_WRAP_CLAMP:
256      case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
257      case PIPE_TEX_WRAP_MIRROR_REPEAT:
258      case PIPE_TEX_WRAP_MIRROR_CLAMP:
259      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
260      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
261      default:
262         assert(0);
263         coord0 = int_coord_bld->zero;
264         coord1 = int_coord_bld->zero;
265         break;
266      }
267      lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
268                                     offset0, i0);
269      lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
270                                     offset1, i1);
271      return;
272   }
273
274   *i0 = int_coord_bld->zero;
275   *i1 = int_coord_bld->zero;
276
277   switch(wrap_mode) {
278   case PIPE_TEX_WRAP_REPEAT:
279      if (is_pot) {
280         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
281      }
282      else {
283         LLVMValueRef weight;
284         LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
285         lp_build_coord_repeat_npot_linear(bld, coord_f,
286                                           length, length_f,
287                                           &coord0, &weight);
288         weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
289         *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
290      }
291
292      mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
293                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
294
295      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
296      *offset1 = LLVMBuildAnd(builder,
297                              lp_build_add(int_coord_bld, *offset0, stride),
298                              mask, "");
299      break;
300
301   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
302      /* XXX this might be slower than the separate path
303       * on some newer cpus. With sse41 this is 8 instructions vs. 7
304       * - at least on SNB this is almost certainly slower since
305       * min/max are cheaper than selects, and the muls aren't bad.
306       */
307      lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
308                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
309      umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
310                               PIPE_FUNC_LESS, coord0, length_minus_one);
311
312      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
313      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);
314
315      mask = LLVMBuildAnd(builder, lmask, umask, "");
316
317      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
318      *offset1 = lp_build_add(int_coord_bld,
319                              *offset0,
320                              LLVMBuildAnd(builder, stride, mask, ""));
321      break;
322
323   case PIPE_TEX_WRAP_CLAMP:
324   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
325   case PIPE_TEX_WRAP_MIRROR_REPEAT:
326   case PIPE_TEX_WRAP_MIRROR_CLAMP:
327   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
328   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
329   default:
330      assert(0);
331      *offset0 = int_coord_bld->zero;
332      *offset1 = int_coord_bld->zero;
333      break;
334   }
335}
336
337
338/**
339 * Build LLVM code for texture coord wrapping, for linear filtering,
340 * for float texcoords.
341 * \param block_length  is the length of the pixel block along the
342 *                      coordinate axis
343 * \param coord  the incoming texcoord (s,t,r or q)
344 * \param length  the texture size along one dimension
345 * \param is_pot  if TRUE, length is a power of two
346 * \param wrap_mode  one of PIPE_TEX_WRAP_x
347 * \param coord0  the first texcoord after wrapping, as int
348 * \param coord1  the second texcoord after wrapping, as int
349 * \param weight  the filter weight as int (0-255)
350 * \param force_nearest  if this coord actually uses nearest filtering
351 */
352static void
353lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld,
354                                  unsigned block_length,
355                                  LLVMValueRef coord,
356                                  LLVMValueRef length,
357                                  boolean is_pot,
358                                  unsigned wrap_mode,
359                                  LLVMValueRef *coord0,
360                                  LLVMValueRef *coord1,
361                                  LLVMValueRef *weight,
362                                  unsigned force_nearest)
363{
364   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
365   struct lp_build_context *coord_bld = &bld->coord_bld;
366   LLVMBuilderRef builder = bld->gallivm->builder;
367   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
368   LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
369
370   switch(wrap_mode) {
371   case PIPE_TEX_WRAP_REPEAT:
372      if (is_pot) {
373         /* mul by size and subtract 0.5 */
374         coord = lp_build_mul(coord_bld, coord, length);
375         if (!force_nearest)
376            coord = lp_build_sub(coord_bld, coord, half);
377         *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
378         /* convert to int, compute lerp weight */
379         lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
380         *coord1 = lp_build_ifloor(coord_bld, *coord1);
381         /* repeat wrap */
382         length_minus_one = lp_build_itrunc(coord_bld, length_minus_one);
383         *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, "");
384         *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, "");
385      }
386      else {
387         LLVMValueRef mask;
388         /* wrap with normalized floats is just fract */
389         coord = lp_build_fract(coord_bld, coord);
390         /* unnormalize */
391         coord = lp_build_mul(coord_bld, coord, length);
392         /*
393          * we avoided the 0.5/length division, have to fix up wrong
394          * edge cases with selects
395          */
396         *coord1 = lp_build_add(coord_bld, coord, half);
397         coord = lp_build_sub(coord_bld, coord, half);
398         *weight = lp_build_fract(coord_bld, coord);
399         mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
400                                 PIPE_FUNC_LESS, coord, coord_bld->zero);
401         *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord);
402         *coord0 = lp_build_itrunc(coord_bld, *coord0);
403         mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
404                                 PIPE_FUNC_LESS, *coord1, length);
405         *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero);
406         *coord1 = lp_build_itrunc(coord_bld, *coord1);
407      }
408      break;
409   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
410      if (bld->static_state->normalized_coords) {
411         /* mul by tex size */
412         coord = lp_build_mul(coord_bld, coord, length);
413      }
414      /* subtract 0.5 */
415      if (!force_nearest) {
416         coord = lp_build_sub(coord_bld, coord, half);
417      }
418      /* clamp to [0, length - 1] */
419      coord = lp_build_min(coord_bld, coord, length_minus_one);
420      coord = lp_build_max(coord_bld, coord, coord_bld->zero);
421      *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
422      /* convert to int, compute lerp weight */
423      lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
424      /* coord1 = min(coord1, length-1) */
425      *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one);
426      *coord1 = lp_build_itrunc(coord_bld, *coord1);
427      break;
428   default:
429      assert(0);
430      *coord0 = int_coord_bld->zero;
431      *coord1 = int_coord_bld->zero;
432      *weight = coord_bld->zero;
433      break;
434   }
435   *weight = lp_build_mul_imm(coord_bld, *weight, 256);
436   *weight = lp_build_itrunc(coord_bld, *weight);
437   return;
438}
439
440
441/**
442 * Fetch texels for image with nearest sampling.
443 * Return filtered color as two vectors of 16-bit fixed point values.
444 */
445static void
446lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
447                                    LLVMValueRef data_ptr,
448                                    LLVMValueRef offset,
449                                    LLVMValueRef x_subcoord,
450                                    LLVMValueRef y_subcoord,
451                                    LLVMValueRef *colors_lo,
452                                    LLVMValueRef *colors_hi)
453{
454   /*
455    * Fetch the pixels as 4 x 32bit (rgba order might differ):
456    *
457    *   rgba0 rgba1 rgba2 rgba3
458    *
459    * bit cast them into 16 x u8
460    *
461    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
462    *
463    * unpack them into two 8 x i16:
464    *
465    *   r0 g0 b0 a0 r1 g1 b1 a1
466    *   r2 g2 b2 a2 r3 g3 b3 a3
467    *
468    * The higher 8 bits of the resulting elements will be zero.
469    */
470   LLVMBuilderRef builder = bld->gallivm->builder;
471   LLVMValueRef rgba8;
472   struct lp_build_context h16, u8n;
473   LLVMTypeRef u8n_vec_type;
474
475   lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
476   lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
477   u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
478
479   if (util_format_is_rgba8_variant(bld->format_desc)) {
480      /*
481       * Given the format is a rgba8, just read the pixels as is,
482       * without any swizzling. Swizzling will be done later.
483       */
484      rgba8 = lp_build_gather(bld->gallivm,
485                              bld->texel_type.length,
486                              bld->format_desc->block.bits,
487                              bld->texel_type.width,
488                              data_ptr, offset);
489
490      rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
491   }
492   else {
493      rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
494                                      bld->format_desc,
495                                      u8n.type,
496                                      data_ptr, offset,
497                                      x_subcoord,
498                                      y_subcoord);
499   }
500
501   /* Expand one 4*rgba8 to two 2*rgba16 */
502   lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
503                    rgba8,
504                    colors_lo, colors_hi);
505}
506
507
508/**
509 * Sample a single texture image with nearest sampling.
510 * If sampling a cube texture, r = cube face in [0,5].
511 * Return filtered color as two vectors of 16-bit fixed point values.
512 */
513static void
514lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
515                              LLVMValueRef int_size,
516                              LLVMValueRef row_stride_vec,
517                              LLVMValueRef img_stride_vec,
518                              LLVMValueRef data_ptr,
519                              LLVMValueRef s,
520                              LLVMValueRef t,
521                              LLVMValueRef r,
522                              LLVMValueRef *colors_lo,
523                              LLVMValueRef *colors_hi)
524{
525   const unsigned dims = bld->dims;
526   LLVMBuilderRef builder = bld->gallivm->builder;
527   struct lp_build_context i32;
528   LLVMTypeRef i32_vec_type;
529   LLVMValueRef i32_c8;
530   LLVMValueRef width_vec, height_vec, depth_vec;
531   LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
532   LLVMValueRef s_float, t_float = NULL, r_float = NULL;
533   LLVMValueRef x_stride;
534   LLVMValueRef x_offset, offset;
535   LLVMValueRef x_subcoord, y_subcoord, z_subcoord;
536
537   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
538
539   i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
540
541   lp_build_extract_image_sizes(bld,
542                                bld->int_size_type,
543                                bld->int_coord_type,
544                                int_size,
545                                &width_vec,
546                                &height_vec,
547                                &depth_vec);
548
549   s_float = s; t_float = t; r_float = r;
550
551   if (bld->static_state->normalized_coords) {
552      LLVMValueRef scaled_size;
553      LLVMValueRef flt_size;
554
555      /* scale size by 256 (8 fractional bits) */
556      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);
557
558      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);
559
560      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
561   }
562   else {
563      /* scale coords by 256 (8 fractional bits) */
564      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
565      if (dims >= 2)
566         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
567      if (dims >= 3)
568         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
569   }
570
571   /* convert float to int */
572   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
573   if (dims >= 2)
574      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
575   if (dims >= 3)
576      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");
577
578   /* compute floor (shift right 8) */
579   i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8);
580   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
581   if (dims >= 2)
582      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
583   if (dims >= 3)
584      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
585
586   /* get pixel, row, image strides */
587   x_stride = lp_build_const_vec(bld->gallivm,
588                                 bld->int_coord_bld.type,
589                                 bld->format_desc->block.bits/8);
590
591   /* Do texcoord wrapping, compute texel offset */
592   lp_build_sample_wrap_nearest_int(bld,
593                                    bld->format_desc->block.width,
594                                    s_ipart, s_float,
595                                    width_vec, x_stride,
596                                    bld->static_state->pot_width,
597                                    bld->static_state->wrap_s,
598                                    &x_offset, &x_subcoord);
599   offset = x_offset;
600   if (dims >= 2) {
601      LLVMValueRef y_offset;
602      lp_build_sample_wrap_nearest_int(bld,
603                                       bld->format_desc->block.height,
604                                       t_ipart, t_float,
605                                       height_vec, row_stride_vec,
606                                       bld->static_state->pot_height,
607                                       bld->static_state->wrap_t,
608                                       &y_offset, &y_subcoord);
609      offset = lp_build_add(&bld->int_coord_bld, offset, y_offset);
610      if (dims >= 3) {
611         LLVMValueRef z_offset;
612         lp_build_sample_wrap_nearest_int(bld,
613                                          1, /* block length (depth) */
614                                          r_ipart, r_float,
615                                          depth_vec, img_stride_vec,
616                                          bld->static_state->pot_depth,
617                                          bld->static_state->wrap_r,
618                                          &z_offset, &z_subcoord);
619         offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
620      }
621      else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
622         LLVMValueRef z_offset;
623         /* The r coord is the cube face in [0,5] */
624         z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
625         offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
626      }
627   }
628
629   lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
630                                       x_subcoord, y_subcoord,
631                                       colors_lo, colors_hi);
632}
633
634
635/**
636 * Sample a single texture image with nearest sampling.
637 * If sampling a cube texture, r = cube face in [0,5].
638 * Return filtered color as two vectors of 16-bit fixed point values.
639 * Does address calcs (except offsets) with floats.
640 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
641 */
642static void
643lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
644                                     LLVMValueRef int_size,
645                                     LLVMValueRef row_stride_vec,
646                                     LLVMValueRef img_stride_vec,
647                                     LLVMValueRef data_ptr,
648                                     LLVMValueRef s,
649                                     LLVMValueRef t,
650                                     LLVMValueRef r,
651                                     LLVMValueRef *colors_lo,
652                                     LLVMValueRef *colors_hi)
653   {
654   const unsigned dims = bld->dims;
655   LLVMValueRef width_vec, height_vec, depth_vec;
656   LLVMValueRef offset;
657   LLVMValueRef x_subcoord, y_subcoord;
658   LLVMValueRef x_icoord, y_icoord, z_icoord;
659   LLVMValueRef flt_size;
660
661   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
662
663   lp_build_extract_image_sizes(bld,
664                                bld->float_size_type,
665                                bld->coord_type,
666                                flt_size,
667                                &width_vec,
668                                &height_vec,
669                                &depth_vec);
670
671   /* Do texcoord wrapping */
672   lp_build_sample_wrap_nearest_float(bld,
673                                      s, width_vec,
674                                      bld->static_state->pot_width,
675                                      bld->static_state->wrap_s,
676                                      &x_icoord);
677
678   if (dims >= 2) {
679      lp_build_sample_wrap_nearest_float(bld,
680                                         t, height_vec,
681                                         bld->static_state->pot_height,
682                                         bld->static_state->wrap_t,
683                                         &y_icoord);
684
685      if (dims >= 3) {
686         lp_build_sample_wrap_nearest_float(bld,
687                                            r, depth_vec,
688                                            bld->static_state->pot_depth,
689                                            bld->static_state->wrap_r,
690                                            &z_icoord);
691      }
692      else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
693         z_icoord = r;
694      }
695   }
696
697   /*
698    * From here on we deal with ints, and we should split up the 256bit
699    * vectors manually for better generated code.
700    */
701
702   /*
703    * compute texel offsets -
704    * cannot do offset calc with floats, difficult for block-based formats,
705    * and not enough precision anyway.
706    */
707   lp_build_sample_offset(&bld->int_coord_bld,
708                          bld->format_desc,
709                          x_icoord, y_icoord,
710                          z_icoord,
711                          row_stride_vec, img_stride_vec,
712                          &offset,
713                          &x_subcoord, &y_subcoord);
714
715   lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
716                                       x_subcoord, y_subcoord,
717                                       colors_lo, colors_hi);
718}
719
720
721/**
722 * Fetch texels for image with linear sampling.
723 * Return filtered color as two vectors of 16-bit fixed point values.
724 */
725static void
726lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
727                                   LLVMValueRef data_ptr,
728                                   LLVMValueRef offset[2][2][2],
729                                   LLVMValueRef x_subcoord[2],
730                                   LLVMValueRef y_subcoord[2],
731                                   LLVMValueRef s_fpart,
732                                   LLVMValueRef t_fpart,
733                                   LLVMValueRef r_fpart,
734                                   LLVMValueRef *colors_lo,
735                                   LLVMValueRef *colors_hi)
736{
737   const unsigned dims = bld->dims;
738   LLVMBuilderRef builder = bld->gallivm->builder;
739   struct lp_build_context h16, u8n;
740   LLVMTypeRef h16_vec_type, u8n_vec_type;
741   LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
742   LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
743   LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
744   LLVMValueRef shuffle_lo, shuffle_hi;
745   LLVMValueRef s_fpart_lo, s_fpart_hi;
746   LLVMValueRef t_fpart_lo = NULL, t_fpart_hi = NULL;
747   LLVMValueRef r_fpart_lo = NULL, r_fpart_hi = NULL;
748   LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
749   LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
750   LLVMValueRef packed_lo, packed_hi;
751   unsigned i, j, k;
752   unsigned numj, numk;
753
754   lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
755   lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
756   h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
757   u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
758
759   /*
760    * Transform 4 x i32 in
761    *
762    *   s_fpart = {s0, s1, s2, s3}
763    *
764    * into 8 x i16
765    *
766    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
767    *
768    * into two 8 x i16
769    *
770    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
771    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
772    *
773    * and likewise for t_fpart. There is no risk of loosing precision here
774    * since the fractional parts only use the lower 8bits.
775    */
776   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
777   if (dims >= 2)
778      t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
779   if (dims >= 3)
780      r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
781
782   for (j = 0; j < h16.type.length; j += 4) {
783#ifdef PIPE_ARCH_LITTLE_ENDIAN
784      unsigned subindex = 0;
785#else
786      unsigned subindex = 1;
787#endif
788      LLVMValueRef index;
789
790      index = LLVMConstInt(elem_type, j/2 + subindex, 0);
791      for (i = 0; i < 4; ++i)
792         shuffles_lo[j + i] = index;
793
794      index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
795      for (i = 0; i < 4; ++i)
796         shuffles_hi[j + i] = index;
797   }
798
799   shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
800   shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
801
802   s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
803                                       shuffle_lo, "");
804   s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
805                                       shuffle_hi, "");
806   if (dims >= 2) {
807      t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
808                                          shuffle_lo, "");
809      t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
810                                          shuffle_hi, "");
811   }
812   if (dims >= 3) {
813      r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
814                                          shuffle_lo, "");
815      r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
816                                          shuffle_hi, "");
817   }
818
819   /*
820    * Fetch the pixels as 4 x 32bit (rgba order might differ):
821    *
822    *   rgba0 rgba1 rgba2 rgba3
823    *
824    * bit cast them into 16 x u8
825    *
826    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
827    *
828    * unpack them into two 8 x i16:
829    *
830    *   r0 g0 b0 a0 r1 g1 b1 a1
831    *   r2 g2 b2 a2 r3 g3 b3 a3
832    *
833    * The higher 8 bits of the resulting elements will be zero.
834    */
835   numj = 1 + (dims >= 2);
836   numk = 1 + (dims >= 3);
837
838   for (k = 0; k < numk; k++) {
839      for (j = 0; j < numj; j++) {
840         for (i = 0; i < 2; i++) {
841            LLVMValueRef rgba8;
842
843            if (util_format_is_rgba8_variant(bld->format_desc)) {
844               /*
845                * Given the format is a rgba8, just read the pixels as is,
846                * without any swizzling. Swizzling will be done later.
847                */
848               rgba8 = lp_build_gather(bld->gallivm,
849                                       bld->texel_type.length,
850                                       bld->format_desc->block.bits,
851                                       bld->texel_type.width,
852                                       data_ptr, offset[k][j][i]);
853
854               rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
855            }
856            else {
857               rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
858                                               bld->format_desc,
859                                               u8n.type,
860                                               data_ptr, offset[k][j][i],
861                                               x_subcoord[i],
862                                               y_subcoord[j]);
863            }
864
865            /* Expand one 4*rgba8 to two 2*rgba16 */
866            lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
867                             rgba8,
868                             &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
869         }
870      }
871   }
872
873   /*
874    * Linear interpolation with 8.8 fixed point.
875    */
876   if (bld->static_state->force_nearest_s) {
877      /* special case 1-D lerp */
878      packed_lo = lp_build_lerp(&h16,
879                                t_fpart_lo,
880                                neighbors_lo[0][0][0],
881                                neighbors_lo[0][0][1]);
882
883      packed_hi = lp_build_lerp(&h16,
884                                t_fpart_hi,
885                                neighbors_hi[0][1][0],
886                                neighbors_hi[0][1][0]);
887   }
888   else if (bld->static_state->force_nearest_t) {
889      /* special case 1-D lerp */
890      packed_lo = lp_build_lerp(&h16,
891                                s_fpart_lo,
892                                neighbors_lo[0][0][0],
893                                neighbors_lo[0][0][1]);
894
895      packed_hi = lp_build_lerp(&h16,
896                                s_fpart_hi,
897                                neighbors_hi[0][0][0],
898                                neighbors_hi[0][0][1]);
899   }
900   else {
901      /* general 1/2/3-D lerping */
902      if (dims == 1) {
903         packed_lo = lp_build_lerp(&h16,
904                                   s_fpart_lo,
905                                   neighbors_lo[0][0][0],
906                                   neighbors_lo[0][0][1]);
907
908         packed_hi = lp_build_lerp(&h16,
909                                   s_fpart_hi,
910                                   neighbors_hi[0][0][0],
911                                   neighbors_hi[0][0][1]);
912      }
913      else {
914         /* 2-D lerp */
915         packed_lo = lp_build_lerp_2d(&h16,
916                                      s_fpart_lo, t_fpart_lo,
917                                      neighbors_lo[0][0][0],
918                                      neighbors_lo[0][0][1],
919                                      neighbors_lo[0][1][0],
920                                      neighbors_lo[0][1][1]);
921
922         packed_hi = lp_build_lerp_2d(&h16,
923                                      s_fpart_hi, t_fpart_hi,
924                                      neighbors_hi[0][0][0],
925                                      neighbors_hi[0][0][1],
926                                      neighbors_hi[0][1][0],
927                                      neighbors_hi[0][1][1]);
928
929         if (dims >= 3) {
930            LLVMValueRef packed_lo2, packed_hi2;
931
932            /* lerp in the second z slice */
933            packed_lo2 = lp_build_lerp_2d(&h16,
934                                          s_fpart_lo, t_fpart_lo,
935                                          neighbors_lo[1][0][0],
936                                          neighbors_lo[1][0][1],
937                                          neighbors_lo[1][1][0],
938                                          neighbors_lo[1][1][1]);
939
940            packed_hi2 = lp_build_lerp_2d(&h16,
941                                          s_fpart_hi, t_fpart_hi,
942                                          neighbors_hi[1][0][0],
943                                          neighbors_hi[1][0][1],
944                                          neighbors_hi[1][1][0],
945                                          neighbors_hi[1][1][1]);
946            /* interp between two z slices */
947            packed_lo = lp_build_lerp(&h16, r_fpart_lo,
948                                      packed_lo, packed_lo2);
949            packed_hi = lp_build_lerp(&h16, r_fpart_hi,
950                                      packed_hi, packed_hi2);
951         }
952      }
953   }
954
955   *colors_lo = packed_lo;
956   *colors_hi = packed_hi;
957}
958
959/**
960 * Sample a single texture image with (bi-)(tri-)linear sampling.
961 * Return filtered color as two vectors of 16-bit fixed point values.
962 */
963static void
964lp_build_sample_image_linear(struct lp_build_sample_context *bld,
965                             LLVMValueRef int_size,
966                             LLVMValueRef row_stride_vec,
967                             LLVMValueRef img_stride_vec,
968                             LLVMValueRef data_ptr,
969                             LLVMValueRef s,
970                             LLVMValueRef t,
971                             LLVMValueRef r,
972                             LLVMValueRef *colors_lo,
973                             LLVMValueRef *colors_hi)
974{
975   const unsigned dims = bld->dims;
976   LLVMBuilderRef builder = bld->gallivm->builder;
977   struct lp_build_context i32;
978   LLVMTypeRef i32_vec_type;
979   LLVMValueRef i32_c8, i32_c128, i32_c255;
980   LLVMValueRef width_vec, height_vec, depth_vec;
981   LLVMValueRef s_ipart, s_fpart, s_float;
982   LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
983   LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
984   LLVMValueRef x_stride, y_stride, z_stride;
985   LLVMValueRef x_offset0, x_offset1;
986   LLVMValueRef y_offset0, y_offset1;
987   LLVMValueRef z_offset0, z_offset1;
988   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
989   LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
990   unsigned x, y, z;
991
992   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
993
994   i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
995
996   lp_build_extract_image_sizes(bld,
997                                bld->int_size_type,
998                                bld->int_coord_type,
999                                int_size,
1000                                &width_vec,
1001                                &height_vec,
1002                                &depth_vec);
1003
1004   s_float = s; t_float = t; r_float = r;
1005
1006   if (bld->static_state->normalized_coords) {
1007      LLVMValueRef scaled_size;
1008      LLVMValueRef flt_size;
1009
1010      /* scale size by 256 (8 fractional bits) */
1011      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);
1012
1013      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);
1014
1015      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
1016   }
1017   else {
1018      /* scale coords by 256 (8 fractional bits) */
1019      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
1020      if (dims >= 2)
1021         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
1022      if (dims >= 3)
1023         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
1024   }
1025
1026   /* convert float to int */
1027   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
1028   if (dims >= 2)
1029      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
1030   if (dims >= 3)
1031      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");
1032
1033   /* subtract 0.5 (add -128) */
1034   i32_c128 = lp_build_const_int_vec(bld->gallivm, i32.type, -128);
1035   if (!bld->static_state->force_nearest_s) {
1036      s = LLVMBuildAdd(builder, s, i32_c128, "");
1037   }
1038   if (dims >= 2 && !bld->static_state->force_nearest_t) {
1039      t = LLVMBuildAdd(builder, t, i32_c128, "");
1040   }
1041   if (dims >= 3) {
1042      r = LLVMBuildAdd(builder, r, i32_c128, "");
1043   }
1044
1045   /* compute floor (shift right 8) */
1046   i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8);
1047   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
1048   if (dims >= 2)
1049      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
1050   if (dims >= 3)
1051      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
1052
1053   /* compute fractional part (AND with 0xff) */
1054   i32_c255 = lp_build_const_int_vec(bld->gallivm, i32.type, 255);
1055   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
1056   if (dims >= 2)
1057      t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
1058   if (dims >= 3)
1059      r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");
1060
1061   /* get pixel, row and image strides */
1062   x_stride = lp_build_const_vec(bld->gallivm, bld->int_coord_bld.type,
1063                                 bld->format_desc->block.bits/8);
1064   y_stride = row_stride_vec;
1065   z_stride = img_stride_vec;
1066
1067   /* do texcoord wrapping and compute texel offsets */
1068   lp_build_sample_wrap_linear_int(bld,
1069                                   bld->format_desc->block.width,
1070                                   s_ipart, &s_fpart, s_float,
1071                                   width_vec, x_stride,
1072                                   bld->static_state->pot_width,
1073                                   bld->static_state->wrap_s,
1074                                   &x_offset0, &x_offset1,
1075                                   &x_subcoord[0], &x_subcoord[1]);
1076   for (z = 0; z < 2; z++) {
1077      for (y = 0; y < 2; y++) {
1078         offset[z][y][0] = x_offset0;
1079         offset[z][y][1] = x_offset1;
1080      }
1081   }
1082
1083   if (dims >= 2) {
1084      lp_build_sample_wrap_linear_int(bld,
1085                                      bld->format_desc->block.height,
1086                                      t_ipart, &t_fpart, t_float,
1087                                      height_vec, y_stride,
1088                                      bld->static_state->pot_height,
1089                                      bld->static_state->wrap_t,
1090                                      &y_offset0, &y_offset1,
1091                                      &y_subcoord[0], &y_subcoord[1]);
1092
1093      for (z = 0; z < 2; z++) {
1094         for (x = 0; x < 2; x++) {
1095            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
1096                                           offset[z][0][x], y_offset0);
1097            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
1098                                           offset[z][1][x], y_offset1);
1099         }
1100      }
1101   }
1102
1103   if (dims >= 3) {
1104      lp_build_sample_wrap_linear_int(bld,
1105                                      bld->format_desc->block.height,
1106                                      r_ipart, &r_fpart, r_float,
1107                                      depth_vec, z_stride,
1108                                      bld->static_state->pot_depth,
1109                                      bld->static_state->wrap_r,
1110                                      &z_offset0, &z_offset1,
1111                                      &z_subcoord[0], &z_subcoord[1]);
1112      for (y = 0; y < 2; y++) {
1113         for (x = 0; x < 2; x++) {
1114            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
1115                                           offset[0][y][x], z_offset0);
1116            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
1117                                           offset[1][y][x], z_offset1);
1118         }
1119      }
1120   }
1121   else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
1122      LLVMValueRef z_offset;
1123      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
1124      for (y = 0; y < 2; y++) {
1125         for (x = 0; x < 2; x++) {
1126            /* The r coord is the cube face in [0,5] */
1127            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
1128                                           offset[0][y][x], z_offset);
1129         }
1130      }
1131   }
1132
1133   lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
1134                                      x_subcoord, y_subcoord,
1135                                      s_fpart, t_fpart, r_fpart,
1136                                      colors_lo, colors_hi);
1137}
1138
1139
1140/**
1141 * Sample a single texture image with (bi-)(tri-)linear sampling.
1142 * Return filtered color as two vectors of 16-bit fixed point values.
1143 * Does address calcs (except offsets) with floats.
1144 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
1145 */
1146static void
1147lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
1148                                    LLVMValueRef int_size,
1149                                    LLVMValueRef row_stride_vec,
1150                                    LLVMValueRef img_stride_vec,
1151                                    LLVMValueRef data_ptr,
1152                                    LLVMValueRef s,
1153                                    LLVMValueRef t,
1154                                    LLVMValueRef r,
1155                                    LLVMValueRef *colors_lo,
1156                                    LLVMValueRef *colors_hi)
1157{
1158   const unsigned dims = bld->dims;
1159   LLVMValueRef width_vec, height_vec, depth_vec;
1160   LLVMValueRef s_fpart;
1161   LLVMValueRef t_fpart = NULL;
1162   LLVMValueRef r_fpart = NULL;
1163   LLVMValueRef x_stride, y_stride, z_stride;
1164   LLVMValueRef x_offset0, x_offset1;
1165   LLVMValueRef y_offset0, y_offset1;
1166   LLVMValueRef z_offset0, z_offset1;
1167   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
1168   LLVMValueRef x_subcoord[2], y_subcoord[2];
1169   LLVMValueRef flt_size;
1170   LLVMValueRef x_icoord0, x_icoord1;
1171   LLVMValueRef y_icoord0, y_icoord1;
1172   LLVMValueRef z_icoord0, z_icoord1;
1173   unsigned x, y, z;
1174
1175   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
1176
1177   lp_build_extract_image_sizes(bld,
1178                                bld->float_size_type,
1179                                bld->coord_type,
1180                                flt_size,
1181                                &width_vec,
1182                                &height_vec,
1183                                &depth_vec);
1184
1185   /* do texcoord wrapping and compute texel offsets */
1186   lp_build_sample_wrap_linear_float(bld,
1187                                     bld->format_desc->block.width,
1188                                     s, width_vec,
1189                                     bld->static_state->pot_width,
1190                                     bld->static_state->wrap_s,
1191                                     &x_icoord0, &x_icoord1,
1192                                     &s_fpart,
1193                                     bld->static_state->force_nearest_s);
1194
1195   if (dims >= 2) {
1196      lp_build_sample_wrap_linear_float(bld,
1197                                        bld->format_desc->block.height,
1198                                        t, height_vec,
1199                                        bld->static_state->pot_height,
1200                                        bld->static_state->wrap_t,
1201                                        &y_icoord0, &y_icoord1,
1202                                        &t_fpart,
1203                                        bld->static_state->force_nearest_t);
1204
1205      if (dims >= 3) {
1206         lp_build_sample_wrap_linear_float(bld,
1207                                           bld->format_desc->block.height,
1208                                           r, depth_vec,
1209                                           bld->static_state->pot_depth,
1210                                           bld->static_state->wrap_r,
1211                                           &z_icoord0, &z_icoord1,
1212                                           &r_fpart, 0);
1213      }
1214   }
1215
1216   /*
1217    * From here on we deal with ints, and we should split up the 256bit
1218    * vectors manually for better generated code.
1219    */
1220
1221   /* get pixel, row and image strides */
1222   x_stride = lp_build_const_vec(bld->gallivm,
1223                                 bld->int_coord_bld.type,
1224                                 bld->format_desc->block.bits/8);
1225   y_stride = row_stride_vec;
1226   z_stride = img_stride_vec;
1227
1228   /*
1229    * compute texel offset -
1230    * cannot do offset calc with floats, difficult for block-based formats,
1231    * and not enough precision anyway.
1232    */
1233   lp_build_sample_partial_offset(&bld->int_coord_bld,
1234                                  bld->format_desc->block.width,
1235                                  x_icoord0, x_stride,
1236                                  &x_offset0, &x_subcoord[0]);
1237   lp_build_sample_partial_offset(&bld->int_coord_bld,
1238                                  bld->format_desc->block.width,
1239                                  x_icoord1, x_stride,
1240                                  &x_offset1, &x_subcoord[1]);
1241   for (z = 0; z < 2; z++) {
1242      for (y = 0; y < 2; y++) {
1243         offset[z][y][0] = x_offset0;
1244         offset[z][y][1] = x_offset1;
1245      }
1246   }
1247
1248   if (dims >= 2) {
1249      lp_build_sample_partial_offset(&bld->int_coord_bld,
1250                                     bld->format_desc->block.height,
1251                                     y_icoord0, y_stride,
1252                                     &y_offset0, &y_subcoord[0]);
1253      lp_build_sample_partial_offset(&bld->int_coord_bld,
1254                                     bld->format_desc->block.height,
1255                                     y_icoord1, y_stride,
1256                                     &y_offset1, &y_subcoord[1]);
1257      for (z = 0; z < 2; z++) {
1258         for (x = 0; x < 2; x++) {
1259            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
1260                                           offset[z][0][x], y_offset0);
1261            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
1262                                           offset[z][1][x], y_offset1);
1263         }
1264      }
1265   }
1266
1267   if (dims >= 3) {
1268      LLVMValueRef z_subcoord[2];
1269      lp_build_sample_partial_offset(&bld->int_coord_bld,
1270                                     1,
1271                                     z_icoord0, z_stride,
1272                                     &z_offset0, &z_subcoord[0]);
1273      lp_build_sample_partial_offset(&bld->int_coord_bld,
1274                                     1,
1275                                     z_icoord1, z_stride,
1276                                     &z_offset1, &z_subcoord[1]);
1277      for (y = 0; y < 2; y++) {
1278         for (x = 0; x < 2; x++) {
1279            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
1280                                           offset[0][y][x], z_offset0);
1281            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
1282                                           offset[1][y][x], z_offset1);
1283         }
1284      }
1285   }
1286   else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
1287      LLVMValueRef z_offset;
1288      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
1289      for (y = 0; y < 2; y++) {
1290         for (x = 0; x < 2; x++) {
1291            /* The r coord is the cube face in [0,5] */
1292            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
1293                                           offset[0][y][x], z_offset);
1294         }
1295      }
1296   }
1297
1298   lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
1299                                      x_subcoord, y_subcoord,
1300                                      s_fpart, t_fpart, r_fpart,
1301                                      colors_lo, colors_hi);
1302}
1303
1304
1305/**
1306 * Sample the texture/mipmap using given image filter and mip filter.
1307 * data0_ptr and data1_ptr point to the two mipmap levels to sample
1308 * from.  width0/1_vec, height0/1_vec, depth0/1_vec indicate their sizes.
1309 * If we're using nearest miplevel sampling the '1' values will be null/unused.
1310 */
1311static void
1312lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1313                       unsigned img_filter,
1314                       unsigned mip_filter,
1315                       LLVMValueRef s,
1316                       LLVMValueRef t,
1317                       LLVMValueRef r,
1318                       LLVMValueRef ilevel0,
1319                       LLVMValueRef ilevel1,
1320                       LLVMValueRef lod_fpart,
1321                       LLVMValueRef colors_lo_var,
1322                       LLVMValueRef colors_hi_var)
1323{
1324   LLVMBuilderRef builder = bld->gallivm->builder;
1325   LLVMValueRef size0;
1326   LLVMValueRef size1;
1327   LLVMValueRef row_stride0_vec = NULL;
1328   LLVMValueRef row_stride1_vec = NULL;
1329   LLVMValueRef img_stride0_vec = NULL;
1330   LLVMValueRef img_stride1_vec = NULL;
1331   LLVMValueRef data_ptr0;
1332   LLVMValueRef data_ptr1;
1333   LLVMValueRef colors0_lo, colors0_hi;
1334   LLVMValueRef colors1_lo, colors1_hi;
1335
1336   /* sample the first mipmap level */
1337   lp_build_mipmap_level_sizes(bld, ilevel0,
1338                               &size0,
1339                               &row_stride0_vec, &img_stride0_vec);
1340   data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1341   if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
1342      if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1343         lp_build_sample_image_nearest_afloat(bld,
1344                                              size0,
1345                                              row_stride0_vec, img_stride0_vec,
1346                                              data_ptr0, s, t, r,
1347                                              &colors0_lo, &colors0_hi);
1348      }
1349      else {
1350         assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1351         lp_build_sample_image_linear_afloat(bld,
1352                                             size0,
1353                                             row_stride0_vec, img_stride0_vec,
1354                                             data_ptr0, s, t, r,
1355                                             &colors0_lo, &colors0_hi);
1356      }
1357   }
1358   else {
1359      if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1360         lp_build_sample_image_nearest(bld,
1361                                       size0,
1362                                       row_stride0_vec, img_stride0_vec,
1363                                       data_ptr0, s, t, r,
1364                                       &colors0_lo, &colors0_hi);
1365      }
1366      else {
1367         assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1368         lp_build_sample_image_linear(bld,
1369                                      size0,
1370                                      row_stride0_vec, img_stride0_vec,
1371                                      data_ptr0, s, t, r,
1372                                      &colors0_lo, &colors0_hi);
1373      }
1374   }
1375
1376   /* Store the first level's colors in the output variables */
1377   LLVMBuildStore(builder, colors0_lo, colors_lo_var);
1378   LLVMBuildStore(builder, colors0_hi, colors_hi_var);
1379
1380   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1381      LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
1382                                                     bld->perquadf_bld.type, 256.0);
1383      LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type);
1384      struct lp_build_if_state if_ctx;
1385      LLVMValueRef need_lerp;
1386      unsigned num_quads = bld->coord_bld.type.length / 4;
1387      unsigned i;
1388
1389      lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
1390      lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");
1391
1392      /* need_lerp = lod_fpart > 0 */
1393      if (num_quads == 1) {
1394         need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
1395                                   lod_fpart, bld->perquadi_bld.zero,
1396                                   "need_lerp");
1397      }
1398      else {
1399         /*
1400          * We'll do mip filtering if any of the quads need it.
1401          * It might be better to split the vectors here and only fetch/filter
1402          * quads which need it.
1403          */
1404         /*
1405          * We need to clamp lod_fpart here since we can get negative
1406          * values which would screw up filtering if not all
1407          * lod_fpart values have same sign.
1408          * We can however then skip the greater than comparison.
1409          */
1410         lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart,
1411                                  bld->perquadi_bld.zero);
1412         need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart);
1413      }
1414
1415      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1416      {
1417         struct lp_build_context h16_bld;
1418
1419         lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
1420
1421         /* sample the second mipmap level */
1422         lp_build_mipmap_level_sizes(bld, ilevel1,
1423                                     &size1,
1424                                     &row_stride1_vec, &img_stride1_vec);
1425         data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1426
1427         if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
1428            if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1429               lp_build_sample_image_nearest_afloat(bld,
1430                                                    size1,
1431                                                    row_stride1_vec, img_stride1_vec,
1432                                                    data_ptr1, s, t, r,
1433                                                    &colors1_lo, &colors1_hi);
1434            }
1435            else {
1436               lp_build_sample_image_linear_afloat(bld,
1437                                                   size1,
1438                                                   row_stride1_vec, img_stride1_vec,
1439                                                   data_ptr1, s, t, r,
1440                                                   &colors1_lo, &colors1_hi);
1441            }
1442         }
1443         else {
1444            if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1445               lp_build_sample_image_nearest(bld,
1446                                             size1,
1447                                             row_stride1_vec, img_stride1_vec,
1448                                             data_ptr1, s, t, r,
1449                                             &colors1_lo, &colors1_hi);
1450            }
1451            else {
1452               lp_build_sample_image_linear(bld,
1453                                            size1,
1454                                            row_stride1_vec, img_stride1_vec,
1455                                            data_ptr1, s, t, r,
1456                                            &colors1_lo, &colors1_hi);
1457            }
1458         }
1459
1460         /* interpolate samples from the two mipmap levels */
1461
1462         if (num_quads == 1) {
1463            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
1464            lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
1465
1466#if HAVE_LLVM == 0x208
1467            /* This is a work-around for a bug in LLVM 2.8.
1468             * Evidently, something goes wrong in the construction of the
1469             * lod_fpart short[8] vector.  Adding this no-effect shuffle seems
1470             * to force the vector to be properly constructed.
1471             * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
1472             */
1473            {
1474               LLVMValueRef shuffles[8], shuffle;
1475               assert(h16_bld.type.length <= Elements(shuffles));
1476               for (i = 0; i < h16_bld.type.length; i++)
1477                  shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
1478               shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
1479               lod_fpart = LLVMBuildShuffleVector(builder,
1480                                                  lod_fpart, lod_fpart,
1481                                                  shuffle, "");
1482            }
1483#endif
1484
1485            colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
1486                                       colors0_lo, colors1_lo);
1487            colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
1488                                       colors0_hi, colors1_hi);
1489         }
1490         else {
1491            LLVMValueRef lod_parts[LP_MAX_VECTOR_LENGTH/16];
1492            struct lp_type perquadi16_type = bld->perquadi_bld.type;
1493            perquadi16_type.width /= 2;
1494            perquadi16_type.length *= 2;
1495            lod_fpart = LLVMBuildBitCast(builder, lod_fpart,
1496                                         lp_build_vec_type(bld->gallivm,
1497                                                           perquadi16_type), "");
1498            /* XXX this only works for exactly 2 quads. More quads need shuffle */
1499            assert(num_quads == 2);
1500            for (i = 0; i < num_quads; i++) {
1501               LLVMValueRef indexi2 = lp_build_const_int32(bld->gallivm, i*2);
1502               lod_parts[i] = lp_build_extract_broadcast(bld->gallivm,
1503                                                         perquadi16_type,
1504                                                         h16_bld.type,
1505                                                         lod_fpart,
1506                                                         indexi2);
1507            }
1508            colors0_lo = lp_build_lerp(&h16_bld, lod_parts[0],
1509                                       colors0_lo, colors1_lo);
1510            colors0_hi = lp_build_lerp(&h16_bld, lod_parts[1],
1511                                       colors0_hi, colors1_hi);
1512         }
1513
1514         LLVMBuildStore(builder, colors0_lo, colors_lo_var);
1515         LLVMBuildStore(builder, colors0_hi, colors_hi_var);
1516      }
1517      lp_build_endif(&if_ctx);
1518   }
1519}
1520
1521
1522
1523/**
1524 * Texture sampling in AoS format.  Used when sampling common 32-bit/texel
1525 * formats.  1D/2D/3D/cube texture supported.  All mipmap sampling modes
1526 * but only limited texture coord wrap modes.
1527 */
1528void
1529lp_build_sample_aos(struct lp_build_sample_context *bld,
1530                    unsigned unit,
1531                    LLVMValueRef s,
1532                    LLVMValueRef t,
1533                    LLVMValueRef r,
1534                    LLVMValueRef lod_ipart,
1535                    LLVMValueRef lod_fpart,
1536                    LLVMValueRef ilevel0,
1537                    LLVMValueRef ilevel1,
1538                    LLVMValueRef texel_out[4])
1539{
1540   struct lp_build_context *int_bld = &bld->int_bld;
1541   LLVMBuilderRef builder = bld->gallivm->builder;
1542   const unsigned mip_filter = bld->static_state->min_mip_filter;
1543   const unsigned min_filter = bld->static_state->min_img_filter;
1544   const unsigned mag_filter = bld->static_state->mag_img_filter;
1545   const unsigned dims = bld->dims;
1546   LLVMValueRef packed, packed_lo, packed_hi;
1547   LLVMValueRef unswizzled[4];
1548   struct lp_build_context h16_bld;
1549
1550   /* we only support the common/simple wrap modes at this time */
1551   assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
1552   if (dims >= 2)
1553      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_t));
1554   if (dims >= 3)
1555      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_r));
1556
1557
1558   /* make 16-bit fixed-pt builder context */
1559   lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
1560
1561   /*
1562    * Get/interpolate texture colors.
1563    */
1564
1565   packed_lo = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_lo");
1566   packed_hi = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_hi");
1567
1568   if (min_filter == mag_filter) {
1569      /* no need to distinguish between minification and magnification */
1570      lp_build_sample_mipmap(bld,
1571                             min_filter, mip_filter,
1572                             s, t, r,
1573                             ilevel0, ilevel1, lod_fpart,
1574                             packed_lo, packed_hi);
1575   }
1576   else {
1577      /* Emit conditional to choose min image filter or mag image filter
1578       * depending on the lod being > 0 or <= 0, respectively.
1579       */
1580      struct lp_build_if_state if_ctx;
1581      LLVMValueRef minify;
1582
1583      /* minify = lod >= 0.0 */
1584      minify = LLVMBuildICmp(builder, LLVMIntSGE,
1585                             lod_ipart, int_bld->zero, "");
1586
1587      lp_build_if(&if_ctx, bld->gallivm, minify);
1588      {
1589         /* Use the minification filter */
1590         lp_build_sample_mipmap(bld,
1591                                min_filter, mip_filter,
1592                                s, t, r,
1593                                ilevel0, ilevel1, lod_fpart,
1594                                packed_lo, packed_hi);
1595      }
1596      lp_build_else(&if_ctx);
1597      {
1598         /* Use the magnification filter */
1599         lp_build_sample_mipmap(bld,
1600                                mag_filter, PIPE_TEX_MIPFILTER_NONE,
1601                                s, t, r,
1602                                ilevel0, NULL, NULL,
1603                                packed_lo, packed_hi);
1604      }
1605      lp_build_endif(&if_ctx);
1606   }
1607
1608   /*
1609    * combine the values stored in 'packed_lo' and 'packed_hi' variables
1610    * into 'packed'
1611    */
1612   packed = lp_build_pack2(bld->gallivm,
1613                           h16_bld.type, lp_type_unorm(8, bld->vector_width),
1614                           LLVMBuildLoad(builder, packed_lo, ""),
1615                           LLVMBuildLoad(builder, packed_hi, ""));
1616
1617   /*
1618    * Convert to SoA and swizzle.
1619    */
1620   lp_build_rgba8_to_f32_soa(bld->gallivm,
1621                             bld->texel_type,
1622                             packed, unswizzled);
1623
1624   if (util_format_is_rgba8_variant(bld->format_desc)) {
1625      lp_build_format_swizzle_soa(bld->format_desc,
1626                                  &bld->texel_bld,
1627                                  unswizzled, texel_out);
1628   }
1629   else {
1630      texel_out[0] = unswizzled[0];
1631      texel_out[1] = unswizzled[1];
1632      texel_out[2] = unswizzled[2];
1633      texel_out[3] = unswizzled[3];
1634   }
1635}
1636