1/**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29#include "pipe/p_defines.h"
30
31#include "util/u_format.h"
32#include "util/u_memory.h"
33#include "util/u_string.h"
34#include "util/u_math.h"
35
36#include "lp_bld_type.h"
37#include "lp_bld_const.h"
38#include "lp_bld_conv.h"
39#include "lp_bld_swizzle.h"
40#include "lp_bld_gather.h"
41#include "lp_bld_debug.h"
42#include "lp_bld_format.h"
43#include "lp_bld_arit.h"
44#include "lp_bld_pack.h"
45
46
47static void
48convert_to_soa(struct gallivm_state *gallivm,
49               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
50               LLVMValueRef dst_soa[4],
51               const struct lp_type soa_type)
52{
53   unsigned j, k;
54   struct lp_type aos_channel_type = soa_type;
55
56   LLVMValueRef aos_channels[4];
57   unsigned pixels_per_channel = soa_type.length / 4;
58
59   debug_assert((soa_type.length % 4) == 0);
60
61   aos_channel_type.length >>= 1;
62
63   for (j = 0; j < 4; ++j) {
64      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
65
66      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
67
68      for (k = 0; k < pixels_per_channel; ++k) {
69         channel[k] = src_aos[j + 4 * k];
70      }
71
72      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
73   }
74
75   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
76}
77
78
79void
80lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
81                            struct lp_build_context *bld,
82                            const LLVMValueRef *unswizzled,
83                            LLVMValueRef swizzled_out[4])
84{
85   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
86      enum pipe_swizzle swizzle;
87      LLVMValueRef depth_or_stencil;
88
89      if (util_format_has_stencil(format_desc) &&
90          !util_format_has_depth(format_desc)) {
91         assert(!bld->type.floating);
92         swizzle = format_desc->swizzle[1];
93      }
94      else {
95         assert(bld->type.floating);
96         swizzle = format_desc->swizzle[0];
97      }
98      /*
99       * Return zzz1 or sss1 for depth-stencil formats here.
100       * Correct swizzling will be handled by apply_sampler_swizzle() later.
101       */
102      depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
103
104      swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
105      swizzled_out[3] = bld->one;
106   }
107   else {
108      unsigned chan;
109      for (chan = 0; chan < 4; ++chan) {
110         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
111         swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
112      }
113   }
114}
115
116
117
118static LLVMValueRef
119lp_build_extract_soa_chan(struct lp_build_context *bld,
120                          unsigned blockbits,
121                          boolean srgb_chan,
122                          struct util_format_channel_description chan_desc,
123                          LLVMValueRef packed)
124{
125   struct gallivm_state *gallivm = bld->gallivm;
126   LLVMBuilderRef builder = gallivm->builder;
127   struct lp_type type = bld->type;
128   LLVMValueRef input = packed;
129   const unsigned width = chan_desc.size;
130   const unsigned start = chan_desc.shift;
131   const unsigned stop = start + width;
132
133   /* Decode the input vector component */
134
135   switch(chan_desc.type) {
136   case UTIL_FORMAT_TYPE_VOID:
137      input = bld->undef;
138      break;
139
140   case UTIL_FORMAT_TYPE_UNSIGNED:
141      /*
142       * Align the LSB
143       */
144      if (start) {
145         input = LLVMBuildLShr(builder, input,
146                               lp_build_const_int_vec(gallivm, type, start), "");
147      }
148
149      /*
150       * Zero the MSBs
151       */
152      if (stop < blockbits) {
153         unsigned mask = ((unsigned long long)1 << width) - 1;
154         input = LLVMBuildAnd(builder, input,
155                              lp_build_const_int_vec(gallivm, type, mask), "");
156      }
157
158      /*
159       * Type conversion
160       */
161      if (type.floating) {
162         if (srgb_chan) {
163            struct lp_type conv_type = lp_uint_type(type);
164            input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
165         }
166         else {
167            if(chan_desc.normalized)
168               input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
169            else
170               input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
171         }
172      }
173      else if (chan_desc.pure_integer) {
174         /* Nothing to do */
175      } else {
176          /* FIXME */
177          assert(0);
178      }
179      break;
180
181   case UTIL_FORMAT_TYPE_SIGNED:
182      /*
183       * Align the sign bit first.
184       */
185      if (stop < type.width) {
186         unsigned bits = type.width - stop;
187         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
188         input = LLVMBuildShl(builder, input, bits_val, "");
189      }
190
191      /*
192       * Align the LSB (with an arithmetic shift to preserve the sign)
193       */
194      if (chan_desc.size < type.width) {
195         unsigned bits = type.width - chan_desc.size;
196         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
197         input = LLVMBuildAShr(builder, input, bits_val, "");
198      }
199
200      /*
201       * Type conversion
202       */
203      if (type.floating) {
204         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
205         if (chan_desc.normalized) {
206            double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
207            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
208            input = LLVMBuildFMul(builder, input, scale_val, "");
209            /*
210             * The formula above will produce value below -1.0 for most negative
211             * value but everything seems happy with that hence disable for now.
212             */
213            if (0)
214               input = lp_build_max(bld, input,
215                                    lp_build_const_vec(gallivm, type, -1.0f));
216         }
217      }
218      else if (chan_desc.pure_integer) {
219         /* Nothing to do */
220      } else {
221          /* FIXME */
222          assert(0);
223      }
224      break;
225
226   case UTIL_FORMAT_TYPE_FLOAT:
227      if (type.floating) {
228         if (chan_desc.size == 16) {
229            struct lp_type f16i_type = type;
230            f16i_type.width /= 2;
231            f16i_type.floating = 0;
232            if (start) {
233               input = LLVMBuildLShr(builder, input,
234                                     lp_build_const_int_vec(gallivm, type, start), "");
235            }
236            input = LLVMBuildTrunc(builder, input,
237                                   lp_build_vec_type(gallivm, f16i_type), "");
238            input = lp_build_half_to_float(gallivm, input);
239         } else {
240            assert(start == 0);
241            assert(stop == 32);
242            assert(type.width == 32);
243         }
244         input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
245      }
246      else {
247         /* FIXME */
248         assert(0);
249         input = bld->undef;
250      }
251      break;
252
253   case UTIL_FORMAT_TYPE_FIXED:
254      if (type.floating) {
255         double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
256         LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
257         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
258         input = LLVMBuildFMul(builder, input, scale_val, "");
259      }
260      else {
261         /* FIXME */
262         assert(0);
263         input = bld->undef;
264      }
265      break;
266
267   default:
268      assert(0);
269      input = bld->undef;
270      break;
271   }
272
273   return input;
274}
275
276
277/**
278 * Unpack several pixels in SoA.
279 *
280 * It takes a vector of packed pixels:
281 *
282 *   packed = {P0, P1, P2, P3, ..., Pn}
283 *
284 * And will produce four vectors:
285 *
286 *   red    = {R0, R1, R2, R3, ..., Rn}
287 *   green  = {G0, G1, G2, G3, ..., Gn}
288 *   blue   = {B0, B1, B2, B3, ..., Bn}
289 *   alpha  = {A0, A1, A2, A3, ..., An}
290 *
291 * It requires that a packed pixel fits into an element of the output
292 * channels. The common case is when converting pixel with a depth of 32 bit or
293 * less into floats.
294 *
295 * \param format_desc  the format of the 'packed' incoming pixel vector
296 * \param type  the desired type for rgba_out (type.length = n, above)
297 * \param packed  the incoming vector of packed pixels
298 * \param rgba_out  returns the SoA R,G,B,A vectors
299 */
300void
301lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
302                         const struct util_format_description *format_desc,
303                         struct lp_type type,
304                         LLVMValueRef packed,
305                         LLVMValueRef rgba_out[4])
306{
307   struct lp_build_context bld;
308   LLVMValueRef inputs[4];
309   unsigned chan;
310
311   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
312   assert(format_desc->block.width == 1);
313   assert(format_desc->block.height == 1);
314   assert(format_desc->block.bits <= type.width);
315   /* FIXME: Support more output types */
316   assert(type.width == 32);
317
318   lp_build_context_init(&bld, gallivm, type);
319
320   /* Decode the input vector components */
321   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
322      struct util_format_channel_description chan_desc = format_desc->channel[chan];
323      boolean srgb_chan = FALSE;
324
325      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
326          format_desc->swizzle[3] != chan) {
327         srgb_chan = TRUE;
328      }
329
330      inputs[chan] = lp_build_extract_soa_chan(&bld,
331                                               format_desc->block.bits,
332                                               srgb_chan,
333                                               chan_desc,
334                                               packed);
335   }
336
337   lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
338}
339
340
341/**
342 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
343 *
344 * \param dst_type  The desired return type. For pure integer formats
345 *                  this should be a 32bit wide int or uint vector type,
346 *                  otherwise a float vector type.
347 *
348 * \param packed    The rgba8 values to pack.
349 *
350 * \param rgba      The 4 SoA return vectors.
351 */
352void
353lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
354                           struct lp_type dst_type,
355                           LLVMValueRef packed,
356                           LLVMValueRef *rgba)
357{
358   LLVMBuilderRef builder = gallivm->builder;
359   LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
360   unsigned chan;
361
362   /* XXX technically shouldn't use that for uint dst_type */
363   packed = LLVMBuildBitCast(builder, packed,
364                             lp_build_int_vec_type(gallivm, dst_type), "");
365
366   /* Decode the input vector components */
367   for (chan = 0; chan < 4; ++chan) {
368#ifdef PIPE_ARCH_LITTLE_ENDIAN
369      unsigned start = chan*8;
370#else
371      unsigned start = (3-chan)*8;
372#endif
373      unsigned stop = start + 8;
374      LLVMValueRef input;
375
376      input = packed;
377
378      if (start)
379         input = LLVMBuildLShr(builder, input,
380                               lp_build_const_int_vec(gallivm, dst_type, start), "");
381
382      if (stop < 32)
383         input = LLVMBuildAnd(builder, input, mask, "");
384
385      if (dst_type.floating)
386         input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
387
388      rgba[chan] = input;
389   }
390}
391
392
393
394/**
395 * Fetch a texels from a texture, returning them in SoA layout.
396 *
397 * \param type  the desired return type for 'rgba'.  The vector length
398 *              is the number of texels to fetch
399 * \param aligned if the offset is guaranteed to be aligned to element width
400 *
401 * \param base_ptr  points to the base of the texture mip tree.
402 * \param offset    offset to start of the texture image block.  For non-
403 *                  compressed formats, this simply is an offset to the texel.
404 *                  For compressed formats, it is an offset to the start of the
405 *                  compressed data block.
406 *
407 * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
408 *              these will always be (0,0).  For compressed formats, i will
409 *              be in [0, block_width-1] and j will be in [0, block_height-1].
410 * \param cache  optional value pointing to a lp_build_format_cache structure
411 */
412void
413lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
414                        const struct util_format_description *format_desc,
415                        struct lp_type type,
416                        boolean aligned,
417                        LLVMValueRef base_ptr,
418                        LLVMValueRef offset,
419                        LLVMValueRef i,
420                        LLVMValueRef j,
421                        LLVMValueRef cache,
422                        LLVMValueRef rgba_out[4])
423{
424   LLVMBuilderRef builder = gallivm->builder;
425   enum pipe_format format = format_desc->format;
426   struct lp_type fetch_type;
427
428   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
429       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
430        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
431        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
432       format_desc->block.width == 1 &&
433       format_desc->block.height == 1 &&
434       format_desc->block.bits <= type.width &&
435       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
436        format_desc->channel[0].size == 32 ||
437        format_desc->channel[0].size == 16))
438   {
439      /*
440       * The packed pixel fits into an element of the destination format. Put
441       * the packed pixels into a vector and extract each component for all
442       * vector elements in parallel.
443       */
444
445      LLVMValueRef packed;
446
447      /*
448       * gather the texels from the texture
449       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
450       */
451      assert(format_desc->block.bits <= type.width);
452      fetch_type = lp_type_uint(type.width);
453      packed = lp_build_gather(gallivm,
454                               type.length,
455                               format_desc->block.bits,
456                               fetch_type,
457                               aligned,
458                               base_ptr, offset, FALSE);
459
460      /*
461       * convert texels to float rgba
462       */
463      lp_build_unpack_rgba_soa(gallivm,
464                               format_desc,
465                               type,
466                               packed, rgba_out);
467      return;
468   }
469
470
471   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
472       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
473       format_desc->block.width == 1 &&
474       format_desc->block.height == 1 &&
475       format_desc->block.bits > type.width &&
476       ((format_desc->block.bits <= type.width * type.length &&
477         format_desc->channel[0].size <= type.width) ||
478        (format_desc->channel[0].size == 64 &&
479         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
480         type.floating)))
481   {
482      /*
483       * Similar to above, but the packed pixel is larger than what fits
484       * into an element of the destination format. The packed pixels will be
485       * shuffled into SoA vectors appropriately, and then the extraction will
486       * be done in parallel as much as possible.
487       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
488       * the gathered vectors can be shuffled easily (even with avx).
489       * 64xn float -> 32xn float is handled too but it's a bit special as
490       * it does the conversion pre-shuffle.
491       */
492
493      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
494      struct lp_type fetch_type, gather_type = type;
495      unsigned num_gather, fetch_width, i, j;
496      struct lp_build_context bld;
497      boolean fp64 = format_desc->channel[0].size == 64;
498
499      lp_build_context_init(&bld, gallivm, type);
500
501      assert(type.width == 32);
502      assert(format_desc->block.bits > type.width);
503
504      /*
505       * First, figure out fetch order.
506       */
507      fetch_width = util_next_power_of_two(format_desc->block.bits);
508      num_gather = fetch_width / type.width;
509      /*
510       * fp64 are treated like fp32 except we fetch twice wide values
511       * (as we shuffle after trunc). The shuffles for that work out
512       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
513       * albeit we miss the potential opportunity for hw gather (as it
514       * only handles native size).
515       */
516      num_gather = fetch_width / type.width;
517      gather_type.width *= num_gather;
518      if (fp64) {
519         num_gather /= 2;
520      }
521      gather_type.length /= num_gather;
522
523      for (i = 0; i < num_gather; i++) {
524         LLVMValueRef offsetr, shuf_vec;
525         if(num_gather == 4) {
526            for (j = 0; j < gather_type.length; j++) {
527               unsigned idx = i + 4*j;
528               shuffles[j] = lp_build_const_int32(gallivm, idx);
529            }
530            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
531            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
532
533         }
534         else if (num_gather == 2) {
535            assert(num_gather == 2);
536            for (j = 0; j < gather_type.length; j++) {
537               unsigned idx = i*2 + (j%2) + (j/2)*4;
538               shuffles[j] = lp_build_const_int32(gallivm, idx);
539            }
540            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
541            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
542         }
543         else {
544            assert(num_gather == 1);
545            offsetr = offset;
546         }
547         if (gather_type.length == 1) {
548            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
549            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
550         }
551
552         /*
553          * Determine whether to use float or int loads. This is mostly
554          * to outsmart the (stupid) llvm int/float shuffle logic, we
555          * don't really care much if the data is floats or ints...
556          * But llvm will refuse to use single float shuffle with int data
557          * and instead use 3 int shuffles instead, the code looks atrocious.
558          * (Note bitcasts often won't help, as llvm is too smart to be
559          * fooled by that.)
560          * Nobody cares about simd float<->int domain transition penalties,
561          * which usually don't even exist for shuffles anyway.
562          * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
563          * going into transpose, which is unpacks, so doesn't really matter
564          * much).
565          * With 2x32bit or 4x16bit fetch, we use float vec, since those
566          * go into the weird channel separation shuffle. With floats,
567          * this is (with 128bit vectors):
568          * - 2 movq, 2 movhpd, 2 shufps
569          * With ints it would be:
570          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
571          * I've seen texture functions increase in code size by 15% just due
572          * to that (there's lots of such fetches in them...)
573          * (We could chose a different gather order to improve this somewhat
574          * for the int path, but it would basically just drop the blends,
575          * so the float path with this order really is optimal.)
576          * Albeit it is tricky sometimes llvm doesn't ignore the float->int
577          * casts so must avoid them until we're done with the float shuffle...
578          * 3x16bit formats (the same is also true for 3x8) are pretty bad but
579          * there's nothing we can do about them (we could overallocate by
580          * those couple bytes and use unaligned but pot sized load).
581          * Note that this is very much x86 specific. I don't know if this
582          * affect other archs at all.
583          */
584         if (num_gather > 1) {
585            /*
586             * We always want some float type here (with x86)
587             * due to shuffles being float ones afterwards (albeit for
588             * the num_gather == 4 case int should work fine too
589             * (unless there's some problems with avx but not avx2).
590             */
591            if (format_desc->channel[0].size == 64) {
592               fetch_type = lp_type_float_vec(64, gather_type.width);
593            } else {
594               fetch_type = lp_type_int_vec(32, gather_type.width);
595            }
596         }
597         else {
598            /* type doesn't matter much */
599            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
600                (format_desc->channel[0].size == 32 ||
601                 format_desc->channel[0].size == 64)) {
602            fetch_type = lp_type_float(gather_type.width);
603            } else {
604               fetch_type = lp_type_uint(gather_type.width);
605            }
606         }
607
608         /* Now finally gather the values */
609         packed[i] = lp_build_gather(gallivm, gather_type.length,
610                                     format_desc->block.bits,
611                                     fetch_type, aligned,
612                                     base_ptr, offsetr, FALSE);
613         if (fp64) {
614            struct lp_type conv_type = type;
615            conv_type.width *= 2;
616            packed[i] = LLVMBuildBitCast(builder, packed[i],
617                                         lp_build_vec_type(gallivm, conv_type), "");
618            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
619         }
620      }
621
622      /* shuffle the gathered values to SoA */
623      if (num_gather == 2) {
624         for (i = 0; i < num_gather; i++) {
625            for (j = 0; j < type.length; j++) {
626               unsigned idx = (j%2)*2 + (j/4)*4 + i;
627               if ((j/2)%2)
628                  idx += type.length;
629               shuffles[j] = lp_build_const_int32(gallivm, idx);
630            }
631            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
632                                            LLVMConstVector(shuffles, type.length), "");
633         }
634      }
635      else if (num_gather == 4) {
636         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
637      }
638      else {
639         assert(num_gather == 1);
640         dst[0] = packed[0];
641      }
642
643      /*
644       * And finally unpack exactly as above, except that
645       * chan shift is adjusted and the right vector selected.
646       */
647      if (!fp64) {
648         for (i = 0; i < num_gather; i++) {
649            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
650         }
651         for (i = 0; i < format_desc->nr_channels; i++) {
652            struct util_format_channel_description chan_desc = format_desc->channel[i];
653            unsigned blockbits = type.width;
654            unsigned vec_nr = chan_desc.shift / type.width;
655            chan_desc.shift %= type.width;
656
657            output[i] = lp_build_extract_soa_chan(&bld,
658                                                  blockbits,
659                                                  FALSE,
660                                                  chan_desc,
661                                                  dst[vec_nr]);
662         }
663      }
664      else {
665         for (i = 0; i < format_desc->nr_channels; i++)  {
666            output[i] = dst[i];
667         }
668      }
669
670      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
671      return;
672   }
673
674   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
675       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
676      /*
677       * similar conceptually to above but requiring special
678       * AoS packed -> SoA float conversion code.
679       */
680      LLVMValueRef packed;
681      struct lp_type fetch_type = lp_type_uint(type.width);
682
683      assert(type.floating);
684      assert(type.width == 32);
685
686      packed = lp_build_gather(gallivm, type.length,
687                               format_desc->block.bits,
688                               fetch_type, aligned,
689                               base_ptr, offset, FALSE);
690      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
691         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
692      }
693      else {
694         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
695      }
696      return;
697   }
698
699   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
700       format_desc->block.bits == 64) {
701      /*
702       * special case the format is 64 bits but we only require
703       * 32bit (or 8bit) from each block.
704       */
705      LLVMValueRef packed;
706      struct lp_type fetch_type = lp_type_uint(type.width);
707
708      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
709         /*
710          * for stencil simply fix up offsets - could in fact change
711          * base_ptr instead even outside the shader.
712          */
713         unsigned mask = (1 << 8) - 1;
714         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
715         offset = LLVMBuildAdd(builder, offset, s_offset, "");
716         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
717                                  aligned, base_ptr, offset, FALSE);
718         packed = LLVMBuildAnd(builder, packed,
719                               lp_build_const_int_vec(gallivm, type, mask), "");
720      }
721      else {
722         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
723         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
724                                  aligned, base_ptr, offset, TRUE);
725         packed = LLVMBuildBitCast(builder, packed,
726                                   lp_build_vec_type(gallivm, type), "");
727      }
728      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
729      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
730      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
731      return;
732   }
733
734   /*
735    * Try calling lp_build_fetch_rgba_aos for all pixels.
736    * Should only really hit subsampled, compressed
737    * (for s3tc srgb too, for rgtc the unorm ones only) by now.
738    * (This is invalid for plain 8unorm formats because we're lazy with
739    * the swizzle since some results would arrive swizzled, some not.)
740    */
741
742   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
743       (util_format_fits_8unorm(format_desc) ||
744        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
745       type.floating && type.width == 32 &&
746       (type.length == 1 || (type.length % 4 == 0))) {
747      struct lp_type tmp_type;
748      struct lp_build_context bld;
749      LLVMValueRef packed, rgba[4];
750      const struct util_format_description *flinear_desc;
751      const struct util_format_description *frgba8_desc;
752      unsigned chan;
753
754      lp_build_context_init(&bld, gallivm, type);
755
756      /*
757       * Make sure the conversion in aos really only does convert to rgba8
758       * and not anything more (so use linear format, adjust type).
759       */
760      flinear_desc = util_format_description(util_format_linear(format));
761      memset(&tmp_type, 0, sizeof tmp_type);
762      tmp_type.width = 8;
763      tmp_type.length = type.length * 4;
764      tmp_type.norm = TRUE;
765
766      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
767                                       aligned, base_ptr, offset, i, j, cache);
768      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");
769
770      /*
771       * The values are now packed so they match ordinary (srgb) RGBA8 format,
772       * hence need to use matching format for unpack.
773       */
774      frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
775      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
776         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
777         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
778      }
779      lp_build_unpack_rgba_soa(gallivm,
780                               frgba8_desc,
781                               type,
782                               packed, rgba);
783
784      /*
785       * We converted 4 channels. Make sure llvm can drop unneeded ones
786       * (luckily the rgba order is fixed, only LA needs special case).
787       */
788      for (chan = 0; chan < 4; chan++) {
789         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
790         if (chan == 3 && util_format_is_luminance_alpha(format)) {
791            swizzle = PIPE_SWIZZLE_W;
792         }
793         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
794      }
795      return;
796   }
797
798
799   /*
800    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
801    *
802    * This is not the most efficient way of fetching pixels, as we
803    * miss some opportunities to do vectorization, but this is
804    * convenient for formats or scenarios for which there was no
805    * opportunity or incentive to optimize.
806    *
807    * We do NOT want to end up here, this typically is quite terrible,
808    * in particular if the formats have less than 4 channels.
809    *
810    * Right now, this should only be hit for:
811    * - RGTC snorm formats
812    *   (those miss fast fetch functions hence they are terrible anyway)
813    */
814
815   {
816      unsigned k;
817      struct lp_type tmp_type;
818      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
819
820      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
821         debug_printf("%s: AoS fetch fallback for %s\n",
822                      __FUNCTION__, format_desc->short_name);
823      }
824
825      tmp_type = type;
826      tmp_type.length = 4;
827
828      /*
829       * Note that vector transpose can be worse compared to insert/extract
830       * for aos->soa conversion (for formats with 1 or 2 channels). However,
831       * we should try to avoid getting here for just about all formats, so
832       * don't bother.
833       */
834
835      /* loop over number of pixels */
836      for(k = 0; k < type.length; ++k) {
837         LLVMValueRef index = lp_build_const_int32(gallivm, k);
838         LLVMValueRef offset_elem;
839         LLVMValueRef i_elem, j_elem;
840
841         offset_elem = LLVMBuildExtractElement(builder, offset,
842                                               index, "");
843
844         i_elem = LLVMBuildExtractElement(builder, i, index, "");
845         j_elem = LLVMBuildExtractElement(builder, j, index, "");
846
847         /* Get a single float[4]={R,G,B,A} pixel */
848         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
849                                                aligned, base_ptr, offset_elem,
850                                                i_elem, j_elem, cache);
851
852      }
853      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
854   }
855}
856