lp_bld_format_aos.c revision 4634cb5921b985f04f2daf00cda2d28036143bd3
1/**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * @file
30 * AoS pixel format manipulation.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 */
34
35
36#include "util/u_format.h"
37#include "util/u_memory.h"
38#include "util/u_math.h"
39#include "util/u_pointer.h"
40#include "util/u_string.h"
41#include "util/u_cpu_detect.h"
42
43#include "lp_bld_arit.h"
44#include "lp_bld_init.h"
45#include "lp_bld_type.h"
46#include "lp_bld_flow.h"
47#include "lp_bld_const.h"
48#include "lp_bld_conv.h"
49#include "lp_bld_swizzle.h"
50#include "lp_bld_gather.h"
51#include "lp_bld_debug.h"
52#include "lp_bld_format.h"
53#include "lp_bld_pack.h"
54#include "lp_bld_intr.h"
55#include "lp_bld_logic.h"
56#include "lp_bld_bitarit.h"
57
58
59/**
60 * Basic swizzling.  Rearrange the order of the unswizzled array elements
61 * according to the format description.  PIPE_SWIZZLE_0/ONE are supported
62 * too.
63 * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}.
64 */
65LLVMValueRef
66lp_build_format_swizzle_aos(const struct util_format_description *desc,
67                            struct lp_build_context *bld,
68                            LLVMValueRef unswizzled)
69{
70   unsigned char swizzles[4];
71   unsigned chan;
72
73   assert(bld->type.length % 4 == 0);
74
75   for (chan = 0; chan < 4; ++chan) {
76      enum pipe_swizzle swizzle;
77
78      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
79         /*
80          * For ZS formats do RGBA = ZZZ1
81          */
82         if (chan == 3) {
83            swizzle = PIPE_SWIZZLE_1;
84         } else if (desc->swizzle[0] == PIPE_SWIZZLE_NONE) {
85            swizzle = PIPE_SWIZZLE_0;
86         } else {
87            swizzle = desc->swizzle[0];
88         }
89      } else {
90         swizzle = desc->swizzle[chan];
91      }
92      swizzles[chan] = swizzle;
93   }
94
95   return lp_build_swizzle_aos(bld, unswizzled, swizzles);
96}
97
98
99/**
100 * Whether the format matches the vector type, apart of swizzles.
101 */
102static inline boolean
103format_matches_type(const struct util_format_description *desc,
104                    struct lp_type type)
105{
106   enum util_format_type chan_type;
107   unsigned chan;
108
109   assert(type.length % 4 == 0);
110
111   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
112       desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
113       desc->block.width != 1 ||
114       desc->block.height != 1) {
115      return FALSE;
116   }
117
118   if (type.floating) {
119      chan_type = UTIL_FORMAT_TYPE_FLOAT;
120   } else if (type.fixed) {
121      chan_type = UTIL_FORMAT_TYPE_FIXED;
122   } else if (type.sign) {
123      chan_type = UTIL_FORMAT_TYPE_SIGNED;
124   } else {
125      chan_type = UTIL_FORMAT_TYPE_UNSIGNED;
126   }
127
128   for (chan = 0; chan < desc->nr_channels; ++chan) {
129      if (desc->channel[chan].size != type.width) {
130         return FALSE;
131      }
132
133      if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) {
134         if (desc->channel[chan].type != chan_type ||
135             desc->channel[chan].normalized != type.norm) {
136            return FALSE;
137         }
138      }
139   }
140
141   return TRUE;
142}
143
144/*
145 * Do rounding when converting small unorm values to larger ones.
146 * Not quite 100% accurate, as it's done by appending MSBs, but
147 * should be good enough.
148 */
149
150static inline LLVMValueRef
151scale_bits_up(struct gallivm_state *gallivm,
152              int src_bits,
153              int dst_bits,
154              LLVMValueRef src,
155              struct lp_type src_type)
156{
157   LLVMBuilderRef builder = gallivm->builder;
158   LLVMValueRef result = src;
159
160   if (src_bits == 1 && dst_bits > 1) {
161      /*
162       * Useful for a1 - we'd need quite some repeated copies otherwise.
163       */
164      struct lp_build_context bld;
165      LLVMValueRef dst_mask;
166      lp_build_context_init(&bld, gallivm, src_type);
167      dst_mask = lp_build_const_int_vec(gallivm, src_type,
168                                        (1 << dst_bits) - 1),
169      result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src,
170                            lp_build_const_int_vec(gallivm, src_type, 0));
171      result = lp_build_andnot(&bld, dst_mask, result);
172   }
173   else if (dst_bits > src_bits) {
174      /* Scale up bits */
175      int db = dst_bits - src_bits;
176
177      /* Shift left by difference in bits */
178      result = LLVMBuildShl(builder,
179                            src,
180                            lp_build_const_int_vec(gallivm, src_type, db),
181                            "");
182
183      if (db <= src_bits) {
184         /* Enough bits in src to fill the remainder */
185         LLVMValueRef lower = LLVMBuildLShr(builder,
186                                            src,
187                                            lp_build_const_int_vec(gallivm, src_type,
188                                                                   src_bits - db),
189                                            "");
190
191         result = LLVMBuildOr(builder, result, lower, "");
192      } else if (db > src_bits) {
193         /* Need to repeatedly copy src bits to fill remainder in dst */
194         unsigned n;
195
196         for (n = src_bits; n < dst_bits; n *= 2) {
197            LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
198
199            result = LLVMBuildOr(builder,
200                                 result,
201                                 LLVMBuildLShr(builder, result, shuv, ""),
202                                 "");
203         }
204      }
205   } else {
206      assert (dst_bits == src_bits);
207   }
208
209   return result;
210}
211
/**
 * Unpack a single pixel into its XYZW components.
 *
 * @param desc  the pixel format for the packed pixel value
 * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
 *               (must be a 32-bit scalar integer; asserted below)
 *
 * @return XYZW in a <4 x float> vector (normalized formats are scaled to
 *         [0,1]; non-normalized channels are returned as raw values).
 */
static inline LLVMValueRef
lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
                               const struct util_format_description *desc,
                               LLVMValueRef packed)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shifted, casted, scaled, masked;
   LLVMValueRef shifts[4];   /* per-channel right-shift amounts */
   LLVMValueRef masks[4];    /* per-channel bit masks (post-shift) */
   LLVMValueRef scales[4];   /* per-channel float multipliers */
   LLVMTypeRef vec32_type;

   boolean normalized;
   boolean needs_uitofp;
   unsigned i;

   /* TODO: Support more formats */
   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(desc->block.width == 1);
   assert(desc->block.height == 1);
   assert(desc->block.bits <= 32);

   /* Do the intermediate integer computations with 32bit integers since it
    * matches floating point size */
   assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));

   vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);

   /* Broadcast the packed value to all four channels
    * before: packed = BGRA
    * after: packed = {BGRA, BGRA, BGRA, BGRA}
    */
   packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
                                   LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
                                   "");
   packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
                                   LLVMConstNull(vec32_type),
                                   "");

   /* Initialize vector constants */
   normalized = FALSE;
   needs_uitofp = FALSE;

   /* Loop over 4 color components, building the per-channel shift/mask/scale
    * constants that extract each channel from the broadcast word. */
   for (i = 0; i < 4; ++i) {
      unsigned bits = desc->channel[i].size;
      unsigned shift = desc->channel[i].shift;

      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
         /* Padding channel: mask to zero; the shift lane is never used. */
         shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
         masks[i] = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
         scales[i] =  LLVMConstNull(LLVMFloatTypeInContext(gallivm->context));
      }
      else {
         unsigned long long mask = (1ULL << bits) - 1;

         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);

         if (bits == 32) {
            /* Full 32-bit channel: SIToFP would misread the MSB as sign. */
            needs_uitofp = TRUE;
         }

         shifts[i] = lp_build_const_int32(gallivm, shift);
         masks[i] = lp_build_const_int32(gallivm, mask);

         if (desc->channel[i].normalized) {
            /* Scale factor maps the integer range [0, mask] onto [0, 1]. */
            scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
            normalized = TRUE;
         }
         else
            scales[i] =  lp_build_const_float(gallivm, 1.0);
      }
   }

   /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
    * into masked = {X, Y, Z, W}
    */
   if (desc->block.bits < 32 && normalized) {
      /*
       * Note: we cannot do the shift below on x86 natively until AVX2.
       *
       * Old llvm versions will resort to scalar extract/shift insert,
       * which is definitely terrible, new versions will just do
       * several vector shifts and shuffle/blend results together.
       * We could turn this into a variable left shift plus a constant
       * right shift, and llvm would then turn the variable left shift
       * into a mul for us (albeit without sse41 the mul needs emulation
       * too...). However, since we're going to do a float mul
       * anyway, we just adjust that mul instead (plus the mask), skipping
       * the shift completely.
       * We could also use a extra mul when the format isn't normalized and
       * we don't have AVX2 support, but don't bother for now. Unfortunately,
       * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
       * rgba8 if it ends up here), as that would require UIToFP, albeit that
       * would be fixable with easy 16bit shuffle (unless there's channels
       * crossing 16bit boundaries).
       */
      for (i = 0; i < 4; ++i) {
         if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
            unsigned bits = desc->channel[i].size;
            unsigned shift = desc->channel[i].shift;
            /* Mask in place (still shifted); fold the shift into the
             * float scale factor instead of shifting the integer. */
            unsigned long long mask = ((1ULL << bits) - 1) << shift;
            scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
            masks[i] = lp_build_const_int32(gallivm, mask);
         }
      }
      masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
   } else {
      shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
      masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
   }

   if (!needs_uitofp) {
      /* UIToFP can't be expressed in SSE2 */
      casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
   } else {
      casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
   }

   /*
    * At this point 'casted' may be a vector of floats such as
    * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied
    * by powers of two). Next, if the pixel values are normalized
    * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
    */

   if (normalized)
      scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), "");
   else
      scaled = casted;

   return scaled;
}
353
354
355/**
356 * Pack a single pixel.
357 *
358 * @param rgba 4 float vector with the unpacked components.
359 *
360 * XXX: This is mostly for reference and testing -- operating a single pixel at
361 * a time is rarely if ever needed.
362 */
363LLVMValueRef
364lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
365                       const struct util_format_description *desc,
366                       LLVMValueRef rgba)
367{
368   LLVMBuilderRef builder = gallivm->builder;
369   LLVMTypeRef type;
370   LLVMValueRef packed = NULL;
371   LLVMValueRef swizzles[4];
372   LLVMValueRef shifted, casted, scaled, unswizzled;
373   LLVMValueRef shifts[4];
374   LLVMValueRef scales[4];
375   boolean normalized;
376   unsigned i, j;
377
378   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
379   assert(desc->block.width == 1);
380   assert(desc->block.height == 1);
381
382   type = LLVMIntTypeInContext(gallivm->context, desc->block.bits);
383
384   /* Unswizzle the color components into the source vector. */
385   for (i = 0; i < 4; ++i) {
386      for (j = 0; j < 4; ++j) {
387         if (desc->swizzle[j] == i)
388            break;
389      }
390      if (j < 4)
391         swizzles[i] = lp_build_const_int32(gallivm, j);
392      else
393         swizzles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
394   }
395
396   unswizzled = LLVMBuildShuffleVector(builder, rgba,
397                                       LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4)),
398                                       LLVMConstVector(swizzles, 4), "");
399
400   normalized = FALSE;
401   for (i = 0; i < 4; ++i) {
402      unsigned bits = desc->channel[i].size;
403      unsigned shift = desc->channel[i].shift;
404
405      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
406         shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
407         scales[i] =  LLVMGetUndef(LLVMFloatTypeInContext(gallivm->context));
408      }
409      else {
410         unsigned mask = (1 << bits) - 1;
411
412         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
413         assert(bits < 32);
414
415         shifts[i] = lp_build_const_int32(gallivm, shift);
416
417         if (desc->channel[i].normalized) {
418            scales[i] = lp_build_const_float(gallivm, mask);
419            normalized = TRUE;
420         }
421         else
422            scales[i] = lp_build_const_float(gallivm, 1.0);
423      }
424   }
425
426   if (normalized)
427      scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
428   else
429      scaled = unswizzled;
430
431   casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), "");
432
433   shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");
434
435   /* Bitwise or all components */
436   for (i = 0; i < 4; ++i) {
437      if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
438         LLVMValueRef component = LLVMBuildExtractElement(builder, shifted,
439                                               lp_build_const_int32(gallivm, i), "");
440         if (packed)
441            packed = LLVMBuildOr(builder, packed, component, "");
442         else
443            packed = component;
444      }
445   }
446
447   if (!packed)
448      packed = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
449
450   if (desc->block.bits < 32)
451      packed = LLVMBuildTrunc(builder, packed, type, "");
452
453   return packed;
454}
455
456
457
458
459/**
460 * Fetch a pixel into a 4 float AoS.
461 *
462 * \param format_desc  describes format of the image we're fetching from
463 * \param aligned  whether the data is guaranteed to be aligned
464 * \param ptr  address of the pixel block (or the texel if uncompressed)
465 * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
466 *              these will always be (0, 0).
467 * \return  a 4 element vector with the pixel's RGBA values.
468 */
469LLVMValueRef
470lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
471                        const struct util_format_description *format_desc,
472                        struct lp_type type,
473                        boolean aligned,
474                        LLVMValueRef base_ptr,
475                        LLVMValueRef offset,
476                        LLVMValueRef i,
477                        LLVMValueRef j,
478                        LLVMValueRef cache)
479{
480   LLVMBuilderRef builder = gallivm->builder;
481   unsigned num_pixels = type.length / 4;
482   struct lp_build_context bld;
483
484   assert(type.length <= LP_MAX_VECTOR_LENGTH);
485   assert(type.length % 4 == 0);
486
487   lp_build_context_init(&bld, gallivm, type);
488
489   /*
490    * Trivial case
491    *
492    * The format matches the type (apart of a swizzle) so no need for
493    * scaling or converting.
494    */
495
496   if (format_matches_type(format_desc, type) &&
497       format_desc->block.bits <= type.width * 4 &&
498       /* XXX this shouldn't be needed */
499       util_is_power_of_two(format_desc->block.bits)) {
500      LLVMValueRef packed;
501      LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
502      struct lp_type fetch_type;
503      unsigned vec_len = type.width * type.length;
504
505      /*
506       * The format matches the type (apart of a swizzle) so no need for
507       * scaling or converting.
508       */
509
510      fetch_type = lp_type_uint(type.width*4);
511      packed = lp_build_gather(gallivm, type.length/4,
512                               format_desc->block.bits, fetch_type,
513                               aligned, base_ptr, offset, TRUE);
514
515      assert(format_desc->block.bits <= vec_len);
516      (void) vec_len; /* silence unused var warning for non-debug build */
517
518      packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, "");
519      return lp_build_format_swizzle_aos(format_desc, &bld, packed);
520   }
521
522   /*
523    * Bit arithmetic for converting small_unorm to unorm8.
524    *
525    * This misses some opportunities for optimizations (like skipping mask
526    * for the highest channel for instance, or doing bit scaling in parallel
527    * for channels with the same bit width) but it should be passable for
528    * all arithmetic formats.
529    */
530   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
531       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
532       util_format_fits_8unorm(format_desc) &&
533       type.width == 8 && type.norm == 1 && type.sign == 0 &&
534       type.fixed == 0 && type.floating == 0) {
535      LLVMValueRef packed, res, chans[4], rgba[4];
536      LLVMTypeRef dst_vec_type, conv_vec_type;
537      struct lp_type fetch_type, conv_type;
538      struct lp_build_context bld_conv;
539      unsigned j;
540
541      fetch_type = lp_type_uint(type.width*4);
542      conv_type = lp_type_int_vec(type.width*4, type.width * type.length);
543      dst_vec_type = lp_build_vec_type(gallivm, type);
544      conv_vec_type = lp_build_vec_type(gallivm, conv_type);
545      lp_build_context_init(&bld, gallivm, conv_type);
546
547      packed = lp_build_gather(gallivm, type.length/4,
548                               format_desc->block.bits, fetch_type,
549                               aligned, base_ptr, offset, TRUE);
550
551      assert(format_desc->block.bits * type.length / 4 <=
552             type.width * type.length);
553
554      packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, "");
555
556      for (j = 0; j < format_desc->nr_channels; ++j) {
557         unsigned mask = 0;
558         unsigned sa = format_desc->channel[j].shift;
559
560         mask = (1 << format_desc->channel[j].size) - 1;
561
562         /* Extract bits from source */
563         chans[j] = LLVMBuildLShr(builder, packed,
564                                  lp_build_const_int_vec(gallivm, conv_type, sa),
565                                  "");
566
567         chans[j] = LLVMBuildAnd(builder, chans[j],
568                                 lp_build_const_int_vec(gallivm, conv_type, mask),
569                                 "");
570
571         /* Scale bits */
572         if (type.norm) {
573            chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size,
574                                     type.width, chans[j], conv_type);
575         }
576      }
577      /*
578       * This is a hacked lp_build_format_swizzle_soa() since we need a
579       * normalized 1 but only 8 bits in a 32bit vector...
580       */
581      for (j = 0; j < 4; ++j) {
582         enum pipe_swizzle swizzle = format_desc->swizzle[j];
583         if (swizzle == PIPE_SWIZZLE_1) {
584            rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1);
585         } else {
586            rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle);
587         }
588         if (j == 0) {
589            res = rgba[j];
590         } else {
591            rgba[j] = LLVMBuildShl(builder, rgba[j],
592                                   lp_build_const_int_vec(gallivm, conv_type,
593                                                          j * type.width), "");
594            res = LLVMBuildOr(builder, res, rgba[j], "");
595         }
596      }
597      res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, "");
598
599      return res;
600   }
601
602   /*
603    * Bit arithmetic
604    */
605
606   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
607       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
608        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
609       format_desc->block.width == 1 &&
610       format_desc->block.height == 1 &&
611       /* XXX this shouldn't be needed */
612       util_is_power_of_two(format_desc->block.bits) &&
613       format_desc->block.bits <= 32 &&
614       format_desc->is_bitmask &&
615       !format_desc->is_mixed &&
616       (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
617        format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED) &&
618       !format_desc->channel[0].pure_integer) {
619
620      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
621      LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
622      struct lp_type conv_type;
623      unsigned k, num_conv_src, num_conv_dst;
624
625      /*
626       * Note this path is generally terrible for fetching multiple pixels.
627       * We should make sure we cannot hit this code path for anything but
628       * single pixels.
629       */
630
631      /*
632       * Unpack a pixel at a time into a <4 x float> RGBA vector
633       */
634
635      for (k = 0; k < num_pixels; ++k) {
636         LLVMValueRef packed;
637
638         packed = lp_build_gather_elem(gallivm, num_pixels,
639                                       format_desc->block.bits, 32, aligned,
640                                       base_ptr, offset, k, FALSE);
641
642         tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
643                                                  format_desc,
644                                                  packed);
645      }
646
647      /*
648       * Type conversion.
649       *
650       * TODO: We could avoid floating conversion for integer to
651       * integer conversions.
652       */
653
654      if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) {
655         debug_printf("%s: unpacking %s with floating point\n",
656                      __FUNCTION__, format_desc->short_name);
657      }
658
659      conv_type = lp_float32_vec4_type();
660      num_conv_src = num_pixels;
661      num_conv_dst = 1;
662
663      if (num_pixels % 8 == 0) {
664         lp_build_concat_n(gallivm, lp_float32_vec4_type(),
665                           tmps, num_pixels, tmps, num_pixels / 2);
666         conv_type.length *= num_pixels / 4;
667         num_conv_src = 4 * num_pixels / 8;
668         if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
669            /*
670             * FIXME: The fast float->unorm path (which is basically
671             * skipping the MIN/MAX which are extremely pointless in any
672             * case) requires that there's 2 destinations...
673             * In any case, we really should make sure we don't hit this
674             * code with multiple pixels for unorm8 dst types, it's
675             * completely hopeless even if we do hit the right conversion.
676             */
677            type.length /= num_pixels / 4;
678            num_conv_dst = num_pixels / 4;
679         }
680      }
681
682      lp_build_conv(gallivm, conv_type, type,
683                    tmps, num_conv_src, res, num_conv_dst);
684
685      if (num_pixels % 8 == 0 &&
686          (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
687         lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
688      }
689
690      return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
691   }
692
693   /* If all channels are of same type and we are not using half-floats */
694   if (format_desc->is_array &&
695       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) {
696      assert(!format_desc->is_mixed);
697      return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
698   }
699
700   /*
701    * YUV / subsampled formats
702    */
703
704   if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
705      struct lp_type tmp_type;
706      LLVMValueRef tmp;
707
708      memset(&tmp_type, 0, sizeof tmp_type);
709      tmp_type.width = 8;
710      tmp_type.length = num_pixels * 4;
711      tmp_type.norm = TRUE;
712
713      tmp = lp_build_fetch_subsampled_rgba_aos(gallivm,
714                                               format_desc,
715                                               num_pixels,
716                                               base_ptr,
717                                               offset,
718                                               i, j);
719
720      lp_build_conv(gallivm,
721                    tmp_type, type,
722                    &tmp, 1, &tmp, 1);
723
724      return tmp;
725   }
726
727   /*
728    * s3tc rgb formats
729    */
730
731   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) {
732      struct lp_type tmp_type;
733      LLVMValueRef tmp;
734
735      memset(&tmp_type, 0, sizeof tmp_type);
736      tmp_type.width = 8;
737      tmp_type.length = num_pixels * 4;
738      tmp_type.norm = TRUE;
739
740      tmp = lp_build_fetch_cached_texels(gallivm,
741                                         format_desc,
742                                         num_pixels,
743                                         base_ptr,
744                                         offset,
745                                         i, j,
746                                         cache);
747
748      lp_build_conv(gallivm,
749                    tmp_type, type,
750                    &tmp, 1, &tmp, 1);
751
752       return tmp;
753   }
754
755   /*
756    * Fallback to util_format_description::fetch_rgba_8unorm().
757    */
758
759   if (format_desc->fetch_rgba_8unorm &&
760       !type.floating && type.width == 8 && !type.sign && type.norm) {
761      /*
762       * Fallback to calling util_format_description::fetch_rgba_8unorm.
763       *
764       * This is definitely not the most efficient way of fetching pixels, as
765       * we miss the opportunity to do vectorization, but this it is a
766       * convenient for formats or scenarios for which there was no opportunity
767       * or incentive to optimize.
768       */
769
770      LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
771      LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
772      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
773      LLVMValueRef function;
774      LLVMValueRef tmp_ptr;
775      LLVMValueRef tmp;
776      LLVMValueRef res;
777      unsigned k;
778
779      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
780         debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n",
781                      __FUNCTION__, format_desc->short_name);
782      }
783
784      /*
785       * Declare and bind format_desc->fetch_rgba_8unorm().
786       */
787
788      {
789         /*
790          * Function to call looks like:
791          *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
792          */
793         LLVMTypeRef ret_type;
794         LLVMTypeRef arg_types[4];
795         LLVMTypeRef function_type;
796
797         ret_type = LLVMVoidTypeInContext(gallivm->context);
798         arg_types[0] = pi8t;
799         arg_types[1] = pi8t;
800         arg_types[2] = i32t;
801         arg_types[3] = i32t;
802         function_type = LLVMFunctionType(ret_type, arg_types,
803                                          ARRAY_SIZE(arg_types), 0);
804
805         /* make const pointer for the C fetch_rgba_8unorm function */
806         function = lp_build_const_int_pointer(gallivm,
807            func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
808
809         /* cast the callee pointer to the function's type */
810         function = LLVMBuildBitCast(builder, function,
811                                     LLVMPointerType(function_type, 0),
812                                     "cast callee");
813      }
814
815      tmp_ptr = lp_build_alloca(gallivm, i32t, "");
816
817      res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));
818
819      /*
820       * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result
821       * in the SoA vectors.
822       */
823
824      for (k = 0; k < num_pixels; ++k) {
825         LLVMValueRef index = lp_build_const_int32(gallivm, k);
826         LLVMValueRef args[4];
827
828         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
829         args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
830                                            base_ptr, offset, k);
831
832         if (num_pixels == 1) {
833            args[2] = i;
834            args[3] = j;
835         }
836         else {
837            args[2] = LLVMBuildExtractElement(builder, i, index, "");
838            args[3] = LLVMBuildExtractElement(builder, j, index, "");
839         }
840
841         LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
842
843         tmp = LLVMBuildLoad(builder, tmp_ptr, "");
844
845         if (num_pixels == 1) {
846            res = tmp;
847         }
848         else {
849            res = LLVMBuildInsertElement(builder, res, tmp, index, "");
850         }
851      }
852
853      /* Bitcast from <n x i32> to <4n x i8> */
854      res = LLVMBuildBitCast(builder, res, bld.vec_type, "");
855
856      return res;
857   }
858
859   /*
860    * Fallback to util_format_description::fetch_rgba_float().
861    */
862
863   if (format_desc->fetch_rgba_float) {
864      /*
865       * Fallback to calling util_format_description::fetch_rgba_float.
866       *
867       * This is definitely not the most efficient way of fetching pixels, as
868       * we miss the opportunity to do vectorization, but this it is a
869       * convenient for formats or scenarios for which there was no opportunity
870       * or incentive to optimize.
871       */
872
873      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
874      LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
875      LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
876      LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
877      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
878      LLVMValueRef function;
879      LLVMValueRef tmp_ptr;
880      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
881      LLVMValueRef res;
882      unsigned k;
883
884      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
885         debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n",
886                      __FUNCTION__, format_desc->short_name);
887      }
888
889      /*
890       * Declare and bind format_desc->fetch_rgba_float().
891       */
892
893      {
894         /*
895          * Function to call looks like:
896          *   fetch(float *dst, const uint8_t *src, unsigned i, unsigned j)
897          */
898         LLVMTypeRef ret_type;
899         LLVMTypeRef arg_types[4];
900
901         ret_type = LLVMVoidTypeInContext(gallivm->context);
902         arg_types[0] = pf32t;
903         arg_types[1] = pi8t;
904         arg_types[2] = i32t;
905         arg_types[3] = i32t;
906
907         function = lp_build_const_func_pointer(gallivm,
908                                                func_to_pointer((func_pointer) format_desc->fetch_rgba_float),
909                                                ret_type,
910                                                arg_types, ARRAY_SIZE(arg_types),
911                                                format_desc->short_name);
912      }
913
914      tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");
915
916      /*
917       * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result
918       * in the SoA vectors.
919       */
920
921      for (k = 0; k < num_pixels; ++k) {
922         LLVMValueRef args[4];
923
924         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
925         args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
926                                            base_ptr, offset, k);
927
928         if (num_pixels == 1) {
929            args[2] = i;
930            args[3] = j;
931         }
932         else {
933            LLVMValueRef index = lp_build_const_int32(gallivm, k);
934            args[2] = LLVMBuildExtractElement(builder, i, index, "");
935            args[3] = LLVMBuildExtractElement(builder, j, index, "");
936         }
937
938         LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
939
940         tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
941      }
942
943      lp_build_conv(gallivm,
944                    lp_float32_vec4_type(),
945                    type,
946                    tmps, num_pixels, &res, 1);
947
948      return res;
949   }
950
951   assert(!util_format_is_pure_integer(format_desc->format));
952
953   assert(0);
954   return lp_build_undef(gallivm, type);
955}
956