lp_bld_depth.c revision ffe2a1ca3c097661dd3f6e3ca5cfd72be184426c
1/**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * @file
30 * Depth/stencil testing to LLVM IR translation.
31 *
32 * To be done accurately/efficiently the depth/stencil test must be done with
33 * the same type/format of the depth/stencil buffer, which implies massaging
34 * the incoming depths to fit into place. Using a more straightforward
35 * type/format for depth/stencil values internally and only convert when
36 * flushing would avoid this, but it would most likely result in depth fighting
37 * artifacts.
38 *
39 * Since we're using linear layout for everything, but we need to deal with
40 * 2x2 quads, we need to load/store multiple values and swizzle them into
41 * place (we could avoid this by doing depth/stencil testing in linear format,
42 * which would be easy for late depth/stencil test as we could do that after
43 * the fragment shader loop just as we do for color buffers, but more tricky
44 * for early depth test as we'd need both masks and interpolated depth in
45 * linear format).
46 *
47 *
48 * @author Jose Fonseca <jfonseca@vmware.com>
49 * @author Brian Paul <jfonseca@vmware.com>
50 */
51
52#include "pipe/p_state.h"
53#include "util/u_format.h"
54#include "util/u_cpu_detect.h"
55
56#include "gallivm/lp_bld_type.h"
57#include "gallivm/lp_bld_arit.h"
58#include "gallivm/lp_bld_bitarit.h"
59#include "gallivm/lp_bld_const.h"
60#include "gallivm/lp_bld_conv.h"
61#include "gallivm/lp_bld_logic.h"
62#include "gallivm/lp_bld_flow.h"
63#include "gallivm/lp_bld_intr.h"
64#include "gallivm/lp_bld_debug.h"
65#include "gallivm/lp_bld_swizzle.h"
66#include "gallivm/lp_bld_pack.h"
67
68#include "lp_bld_depth.h"
69
70
/**
 * Identifies which of the three per-face operation fields of
 * pipe_stencil_state (fail_op, zfail_op, zpass_op) is being requested.
 */
enum stencil_op {
   S_FAIL_OP = 0,   /* selects pipe_stencil_state::fail_op */
   Z_FAIL_OP = 1,   /* selects pipe_stencil_state::zfail_op */
   Z_PASS_OP = 2    /* selects pipe_stencil_state::zpass_op */
};
77
78
79
80/**
81 * Do the stencil test comparison (compare FB stencil values against ref value).
82 * This will be used twice when generating two-sided stencil code.
83 * \param stencil  the front/back stencil state
84 * \param stencilRef  the stencil reference value, replicated as a vector
85 * \param stencilVals  vector of stencil values from framebuffer
86 * \return vector mask of pass/fail values (~0 or 0)
87 */
88static LLVMValueRef
89lp_build_stencil_test_single(struct lp_build_context *bld,
90                             const struct pipe_stencil_state *stencil,
91                             LLVMValueRef stencilRef,
92                             LLVMValueRef stencilVals)
93{
94   LLVMBuilderRef builder = bld->gallivm->builder;
95   const unsigned stencilMax = 255; /* XXX fix */
96   struct lp_type type = bld->type;
97   LLVMValueRef res;
98
99   /*
100    * SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values
101    * are between 0..255 so ensure we generate the fastest comparisons for
102    * wider elements.
103    */
104   if (type.width <= 8) {
105      assert(!type.sign);
106   } else {
107      assert(type.sign);
108   }
109
110   assert(stencil->enabled);
111
112   if (stencil->valuemask != stencilMax) {
113      /* compute stencilRef = stencilRef & valuemask */
114      LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type, stencil->valuemask);
115      stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, "");
116      /* compute stencilVals = stencilVals & valuemask */
117      stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, "");
118   }
119
120   res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals);
121
122   return res;
123}
124
125
126/**
127 * Do the one or two-sided stencil test comparison.
128 * \sa lp_build_stencil_test_single
129 * \param front_facing  an integer vector mask, indicating front (~0) or back
130 *                      (0) facing polygon. If NULL, assume front-facing.
131 */
132static LLVMValueRef
133lp_build_stencil_test(struct lp_build_context *bld,
134                      const struct pipe_stencil_state stencil[2],
135                      LLVMValueRef stencilRefs[2],
136                      LLVMValueRef stencilVals,
137                      LLVMValueRef front_facing)
138{
139   LLVMValueRef res;
140
141   assert(stencil[0].enabled);
142
143   /* do front face test */
144   res = lp_build_stencil_test_single(bld, &stencil[0],
145                                      stencilRefs[0], stencilVals);
146
147   if (stencil[1].enabled && front_facing != NULL) {
148      /* do back face test */
149      LLVMValueRef back_res;
150
151      back_res = lp_build_stencil_test_single(bld, &stencil[1],
152                                              stencilRefs[1], stencilVals);
153
154      res = lp_build_select(bld, front_facing, res, back_res);
155   }
156
157   return res;
158}
159
160
161/**
162 * Apply the stencil operator (add/sub/keep/etc) to the given vector
163 * of stencil values.
164 * \return  new stencil values vector
165 */
166static LLVMValueRef
167lp_build_stencil_op_single(struct lp_build_context *bld,
168                           const struct pipe_stencil_state *stencil,
169                           enum stencil_op op,
170                           LLVMValueRef stencilRef,
171                           LLVMValueRef stencilVals)
172
173{
174   LLVMBuilderRef builder = bld->gallivm->builder;
175   struct lp_type type = bld->type;
176   LLVMValueRef res;
177   LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff);
178   unsigned stencil_op;
179
180   assert(type.sign);
181
182   switch (op) {
183   case S_FAIL_OP:
184      stencil_op = stencil->fail_op;
185      break;
186   case Z_FAIL_OP:
187      stencil_op = stencil->zfail_op;
188      break;
189   case Z_PASS_OP:
190      stencil_op = stencil->zpass_op;
191      break;
192   default:
193      assert(0 && "Invalid stencil_op mode");
194      stencil_op = PIPE_STENCIL_OP_KEEP;
195   }
196
197   switch (stencil_op) {
198   case PIPE_STENCIL_OP_KEEP:
199      res = stencilVals;
200      /* we can return early for this case */
201      return res;
202   case PIPE_STENCIL_OP_ZERO:
203      res = bld->zero;
204      break;
205   case PIPE_STENCIL_OP_REPLACE:
206      res = stencilRef;
207      break;
208   case PIPE_STENCIL_OP_INCR:
209      res = lp_build_add(bld, stencilVals, bld->one);
210      res = lp_build_min(bld, res, max);
211      break;
212   case PIPE_STENCIL_OP_DECR:
213      res = lp_build_sub(bld, stencilVals, bld->one);
214      res = lp_build_max(bld, res, bld->zero);
215      break;
216   case PIPE_STENCIL_OP_INCR_WRAP:
217      res = lp_build_add(bld, stencilVals, bld->one);
218      res = LLVMBuildAnd(builder, res, max, "");
219      break;
220   case PIPE_STENCIL_OP_DECR_WRAP:
221      res = lp_build_sub(bld, stencilVals, bld->one);
222      res = LLVMBuildAnd(builder, res, max, "");
223      break;
224   case PIPE_STENCIL_OP_INVERT:
225      res = LLVMBuildNot(builder, stencilVals, "");
226      res = LLVMBuildAnd(builder, res, max, "");
227      break;
228   default:
229      assert(0 && "bad stencil op mode");
230      res = bld->undef;
231   }
232
233   return res;
234}
235
236
237/**
238 * Do the one or two-sided stencil test op/update.
239 */
240static LLVMValueRef
241lp_build_stencil_op(struct lp_build_context *bld,
242                    const struct pipe_stencil_state stencil[2],
243                    enum stencil_op op,
244                    LLVMValueRef stencilRefs[2],
245                    LLVMValueRef stencilVals,
246                    LLVMValueRef mask,
247                    LLVMValueRef front_facing)
248
249{
250   LLVMBuilderRef builder = bld->gallivm->builder;
251   LLVMValueRef res;
252
253   assert(stencil[0].enabled);
254
255   /* do front face op */
256   res = lp_build_stencil_op_single(bld, &stencil[0], op,
257                                     stencilRefs[0], stencilVals);
258
259   if (stencil[1].enabled && front_facing != NULL) {
260      /* do back face op */
261      LLVMValueRef back_res;
262
263      back_res = lp_build_stencil_op_single(bld, &stencil[1], op,
264                                            stencilRefs[1], stencilVals);
265
266      res = lp_build_select(bld, front_facing, res, back_res);
267   }
268
269   if (stencil[0].writemask != 0xff ||
270       (stencil[1].enabled && front_facing != NULL && stencil[1].writemask != 0xff)) {
271      /* mask &= stencil[0].writemask */
272      LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
273                                                      stencil[0].writemask);
274      if (stencil[1].enabled && stencil[1].writemask != stencil[0].writemask && front_facing != NULL) {
275         LLVMValueRef back_writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
276                                                         stencil[1].writemask);
277         writemask = lp_build_select(bld, front_facing, writemask, back_writemask);
278      }
279
280      mask = LLVMBuildAnd(builder, mask, writemask, "");
281      /* res = (res & mask) | (stencilVals & ~mask) */
282      res = lp_build_select_bitwise(bld, mask, res, stencilVals);
283   }
284   else {
285      /* res = mask ? res : stencilVals */
286      res = lp_build_select(bld, mask, res, stencilVals);
287   }
288
289   return res;
290}
291
292
293
294/**
295 * Return a type that matches the depth/stencil format.
296 */
297struct lp_type
298lp_depth_type(const struct util_format_description *format_desc,
299              unsigned length)
300{
301   struct lp_type type;
302   unsigned z_swizzle;
303
304   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
305   assert(format_desc->block.width == 1);
306   assert(format_desc->block.height == 1);
307
308   memset(&type, 0, sizeof type);
309   type.width = format_desc->block.bits;
310
311   z_swizzle = format_desc->swizzle[0];
312   if (z_swizzle < 4) {
313      if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
314         type.floating = TRUE;
315         assert(z_swizzle == 0);
316         assert(format_desc->channel[z_swizzle].size == 32);
317      }
318      else if(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
319         assert(format_desc->block.bits <= 32);
320         assert(format_desc->channel[z_swizzle].normalized);
321         if (format_desc->channel[z_swizzle].size < format_desc->block.bits) {
322            /* Prefer signed integers when possible, as SSE has less support
323             * for unsigned comparison;
324             */
325            type.sign = TRUE;
326         }
327      }
328      else
329         assert(0);
330   }
331
332   type.length = length;
333
334   return type;
335}
336
337
338/**
339 * Compute bitmask and bit shift to apply to the incoming fragment Z values
340 * and the Z buffer values needed before doing the Z comparison.
341 *
342 * Note that we leave the Z bits in the position that we find them
343 * in the Z buffer (typically 0xffffff00 or 0x00ffffff).  That lets us
344 * get by with fewer bit twiddling steps.
345 */
346static boolean
347get_z_shift_and_mask(const struct util_format_description *format_desc,
348                     unsigned *shift, unsigned *width, unsigned *mask)
349{
350   unsigned total_bits;
351   unsigned z_swizzle;
352   unsigned chan;
353   unsigned padding_left, padding_right;
354
355   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
356   assert(format_desc->block.width == 1);
357   assert(format_desc->block.height == 1);
358
359   /* 64bit d/s format is special already extracted 32 bits */
360   total_bits = format_desc->block.bits > 32 ? 32 : format_desc->block.bits;
361
362   z_swizzle = format_desc->swizzle[0];
363
364   if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
365      return FALSE;
366
367   *width = format_desc->channel[z_swizzle].size;
368
369   padding_right = 0;
370   for (chan = 0; chan < z_swizzle; ++chan)
371      padding_right += format_desc->channel[chan].size;
372
373   padding_left =
374      total_bits - (padding_right + *width);
375
376   if (padding_left || padding_right) {
377      unsigned long long mask_left = (1ULL << (total_bits - padding_left)) - 1;
378      unsigned long long mask_right = (1ULL << (padding_right)) - 1;
379      *mask = mask_left ^ mask_right;
380   }
381   else {
382      *mask = 0xffffffff;
383   }
384
385   *shift = padding_right;
386
387   return TRUE;
388}
389
390
391/**
392 * Compute bitmask and bit shift to apply to the framebuffer pixel values
393 * to put the stencil bits in the least significant position.
394 * (i.e. 0x000000ff)
395 */
396static boolean
397get_s_shift_and_mask(const struct util_format_description *format_desc,
398                     unsigned *shift, unsigned *mask)
399{
400   unsigned s_swizzle;
401   unsigned chan, sz;
402
403   s_swizzle = format_desc->swizzle[1];
404
405   if (s_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
406      return FALSE;
407
408   /* just special case 64bit d/s format */
409   if (format_desc->block.bits > 32) {
410      assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
411      *shift = 0;
412      *mask = 0xff;
413      return TRUE;
414   }
415
416   *shift = 0;
417   for (chan = 0; chan < s_swizzle; chan++)
418      *shift += format_desc->channel[chan].size;
419
420   sz = format_desc->channel[s_swizzle].size;
421   *mask = (1U << sz) - 1U;
422
423   return TRUE;
424}
425
426
427/**
428 * Perform the occlusion test and increase the counter.
429 * Test the depth mask. Add the number of channel which has none zero mask
430 * into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}.
431 * The counter will add 4.
432 *
433 * \param type holds element type of the mask vector.
434 * \param maskvalue is the depth test mask.
435 * \param counter is a pointer of the uint32 counter.
436 */
437void
438lp_build_occlusion_count(struct gallivm_state *gallivm,
439                         struct lp_type type,
440                         LLVMValueRef maskvalue,
441                         LLVMValueRef counter)
442{
443   LLVMBuilderRef builder = gallivm->builder;
444   LLVMContextRef context = gallivm->context;
445   LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1);
446   LLVMValueRef count, newcount;
447
448   assert(type.length <= 16);
449   assert(type.floating);
450
451   if(util_cpu_caps.has_sse && type.length == 4) {
452      const char *movmskintr = "llvm.x86.sse.movmsk.ps";
453      const char *popcntintr = "llvm.ctpop.i32";
454      LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
455                                           lp_build_vec_type(gallivm, type), "");
456      bits = lp_build_intrinsic_unary(builder, movmskintr,
457                                      LLVMInt32TypeInContext(context), bits);
458      count = lp_build_intrinsic_unary(builder, popcntintr,
459                                       LLVMInt32TypeInContext(context), bits);
460   }
461   else if(util_cpu_caps.has_avx && type.length == 8) {
462      const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
463      const char *popcntintr = "llvm.ctpop.i32";
464      LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
465                                           lp_build_vec_type(gallivm, type), "");
466      bits = lp_build_intrinsic_unary(builder, movmskintr,
467                                      LLVMInt32TypeInContext(context), bits);
468      count = lp_build_intrinsic_unary(builder, popcntintr,
469                                       LLVMInt32TypeInContext(context), bits);
470   }
471   else {
472      unsigned i;
473      LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
474      LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8);
475      LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4);
476      LLVMValueRef shufflev, countd;
477      LLVMValueRef shuffles[16];
478      const char *popcntintr = NULL;
479
480      countv = LLVMBuildBitCast(builder, countv, i8vntype, "");
481
482       for (i = 0; i < type.length; i++) {
483          shuffles[i] = lp_build_const_int32(gallivm, 4*i);
484       }
485
486       shufflev = LLVMConstVector(shuffles, type.length);
487       countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, "");
488       countd = LLVMBuildBitCast(builder, countd, counttype, "countd");
489
490       /*
491        * XXX FIXME
492        * this is bad on cpus without popcount (on x86 supported by intel
493        * nehalem, amd barcelona, and up - not tied to sse42).
494        * Would be much faster to just sum the 4 elements of the vector with
495        * some horizontal add (shuffle/add/shuffle/add after the initial and).
496        */
497       switch (type.length) {
498       case 4:
499          popcntintr = "llvm.ctpop.i32";
500          break;
501       case 8:
502          popcntintr = "llvm.ctpop.i64";
503          break;
504       case 16:
505          popcntintr = "llvm.ctpop.i128";
506          break;
507       default:
508          assert(0);
509       }
510       count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);
511
512       if (type.length > 4) {
513          count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 32), "");
514       }
515   }
516   newcount = LLVMBuildLoad(builder, counter, "origcount");
517   newcount = LLVMBuildAdd(builder, newcount, count, "newcount");
518   LLVMBuildStore(builder, newcount, counter);
519}
520
521
522/**
523 * Load depth/stencil values.
524 * The stored values are linear, swizzle them.
525 *
526 * \param type  the data type of the fragment depth/stencil values
527 * \param format_desc  description of the depth/stencil surface
528 * \param is_1d  whether this resource has only one dimension
529 * \param loop_counter  the current loop iteration
530 * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
531 * \param depth_stride  stride of the depth/stencil buffer
532 * \param z_fb  contains z values loaded from fb (may include padding)
533 * \param s_fb  contains s values loaded from fb (may include padding)
534 */
535void
536lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
537                                     struct lp_type z_src_type,
538                                     const struct util_format_description *format_desc,
539                                     boolean is_1d,
540                                     LLVMValueRef depth_ptr,
541                                     LLVMValueRef depth_stride,
542                                     LLVMValueRef *z_fb,
543                                     LLVMValueRef *s_fb,
544                                     LLVMValueRef loop_counter)
545{
546   LLVMBuilderRef builder = gallivm->builder;
547   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
548   LLVMValueRef zs_dst1, zs_dst2;
549   LLVMValueRef zs_dst_ptr;
550   LLVMValueRef depth_offset1, depth_offset2;
551   LLVMTypeRef load_ptr_type;
552   unsigned depth_bytes = format_desc->block.bits / 8;
553   struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
554   struct lp_type zs_load_type = zs_type;
555
556   zs_load_type.length = zs_load_type.length / 2;
557   load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
558
559   if (z_src_type.length == 4) {
560      unsigned i;
561      LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
562                                          lp_build_const_int32(gallivm, 1), "");
563      LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
564                                          lp_build_const_int32(gallivm, 2), "");
565      LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
566                                          depth_stride, "");
567      depth_offset1 = LLVMBuildMul(builder, looplsb,
568                                   lp_build_const_int32(gallivm, depth_bytes * 2), "");
569      depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
570
571      /* just concatenate the loaded 2x2 values into 4-wide vector */
572      for (i = 0; i < 4; i++) {
573         shuffles[i] = lp_build_const_int32(gallivm, i);
574      }
575   }
576   else {
577      unsigned i;
578      LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
579                                         lp_build_const_int32(gallivm, 1), "");
580      assert(z_src_type.length == 8);
581      depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
582      /*
583       * We load 2x4 values, and need to swizzle them (order
584       * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
585       */
586      for (i = 0; i < 8; i++) {
587         shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
588      }
589   }
590
591   depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
592
593   /* Load current z/stencil values from z/stencil buffer */
594   zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
595   zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
596   zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, "");
597   if (is_1d) {
598      zs_dst2 = lp_build_undef(gallivm, zs_load_type);
599   }
600   else {
601      zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
602      zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
603      zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");
604   }
605
606   *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
607                                  LLVMConstVector(shuffles, zs_type.length), "");
608   *s_fb = *z_fb;
609
610   if (format_desc->block.bits < z_src_type.width) {
611      /* Extend destination ZS values (e.g., when reading from Z16_UNORM) */
612      *z_fb = LLVMBuildZExt(builder, *z_fb,
613                            lp_build_int_vec_type(gallivm, z_src_type), "");
614   }
615
616   else if (format_desc->block.bits > 32) {
617      /* rely on llvm to handle too wide vector we have here nicely */
618      unsigned i;
619      struct lp_type typex2 = zs_type;
620      struct lp_type s_type = zs_type;
621      LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4];
622      LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4];
623      LLVMValueRef tmp;
624
625      typex2.width = typex2.width / 2;
626      typex2.length = typex2.length * 2;
627      s_type.width = s_type.width / 2;
628      s_type.floating = 0;
629
630      tmp = LLVMBuildBitCast(builder, *z_fb,
631                             lp_build_vec_type(gallivm, typex2), "");
632
633      for (i = 0; i < zs_type.length; i++) {
634         shuffles1[i] = lp_build_const_int32(gallivm, i * 2);
635         shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1);
636      }
637      *z_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
638                                     LLVMConstVector(shuffles1, zs_type.length), "");
639      *s_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
640                                     LLVMConstVector(shuffles2, zs_type.length), "");
641      *s_fb = LLVMBuildBitCast(builder, *s_fb,
642                               lp_build_vec_type(gallivm, s_type), "");
643      lp_build_name(*s_fb, "s_dst");
644   }
645
646   lp_build_name(*z_fb, "z_dst");
647   lp_build_name(*s_fb, "s_dst");
648   lp_build_name(*z_fb, "z_dst");
649}
650
651/**
652 * Store depth/stencil values.
653 * Incoming values are swizzled (typically n 2x2 quads), stored linear.
654 * If there's a mask it will do select/store otherwise just store.
655 *
656 * \param type  the data type of the fragment depth/stencil values
657 * \param format_desc  description of the depth/stencil surface
658 * \param is_1d  whether this resource has only one dimension
659 * \param mask  the alive/dead pixel mask for the quad (vector)
660 * \param z_fb  z values read from fb (with padding)
661 * \param s_fb  s values read from fb (with padding)
662 * \param loop_counter  the current loop iteration
663 * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
664 * \param depth_stride  stride of the depth/stencil buffer
665 * \param z_value the depth values to store (with padding)
666 * \param s_value the stencil values to store (with padding)
667 */
668void
669lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
670                                      struct lp_type z_src_type,
671                                      const struct util_format_description *format_desc,
672                                      boolean is_1d,
673                                      struct lp_build_mask_context *mask,
674                                      LLVMValueRef z_fb,
675                                      LLVMValueRef s_fb,
676                                      LLVMValueRef loop_counter,
677                                      LLVMValueRef depth_ptr,
678                                      LLVMValueRef depth_stride,
679                                      LLVMValueRef z_value,
680                                      LLVMValueRef s_value)
681{
682   struct lp_build_context z_bld;
683   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
684   LLVMBuilderRef builder = gallivm->builder;
685   LLVMValueRef mask_value = NULL;
686   LLVMValueRef zs_dst1, zs_dst2;
687   LLVMValueRef zs_dst_ptr1, zs_dst_ptr2;
688   LLVMValueRef depth_offset1, depth_offset2;
689   LLVMTypeRef load_ptr_type;
690   unsigned depth_bytes = format_desc->block.bits / 8;
691   struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
692   struct lp_type z_type = zs_type;
693   struct lp_type zs_load_type = zs_type;
694
695   zs_load_type.length = zs_load_type.length / 2;
696   load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
697
698   z_type.width = z_src_type.width;
699
700   lp_build_context_init(&z_bld, gallivm, z_type);
701
702   /*
703    * This is far from ideal, at least for late depth write we should do this
704    * outside the fs loop to avoid all the swizzle stuff.
705    */
706   if (z_src_type.length == 4) {
707      LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
708                                          lp_build_const_int32(gallivm, 1), "");
709      LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
710                                          lp_build_const_int32(gallivm, 2), "");
711      LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
712                                          depth_stride, "");
713      depth_offset1 = LLVMBuildMul(builder, looplsb,
714                                   lp_build_const_int32(gallivm, depth_bytes * 2), "");
715      depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
716   }
717   else {
718      unsigned i;
719      LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
720                                         lp_build_const_int32(gallivm, 1), "");
721      assert(z_src_type.length == 8);
722      depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
723      /*
724       * We load 2x4 values, and need to swizzle them (order
725       * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
726       */
727      for (i = 0; i < 8; i++) {
728         shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
729      }
730   }
731
732   depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
733
734   zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
735   zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, "");
736   zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
737   zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, "");
738
739   if (format_desc->block.bits > 32) {
740      s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, "");
741   }
742
743   if (mask) {
744      mask_value = lp_build_mask_value(mask);
745      z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb);
746      if (format_desc->block.bits > 32) {
747         s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, "");
748         s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb);
749      }
750   }
751
752   if (zs_type.width < z_src_type.width) {
753      /* Truncate ZS values (e.g., when writing to Z16_UNORM) */
754      z_value = LLVMBuildTrunc(builder, z_value,
755                               lp_build_int_vec_type(gallivm, zs_type), "");
756   }
757
758   if (format_desc->block.bits <= 32) {
759      if (z_src_type.length == 4) {
760         zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2);
761         zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2);
762      }
763      else {
764         assert(z_src_type.length == 8);
765         zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value,
766                                          LLVMConstVector(&shuffles[0],
767                                                          zs_load_type.length), "");
768         zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value,
769                                          LLVMConstVector(&shuffles[4],
770                                                          zs_load_type.length), "");
771      }
772   }
773   else {
774      if (z_src_type.length == 4) {
775         zs_dst1 = lp_build_interleave2(gallivm, z_type,
776                                        z_value, s_value, 0);
777         zs_dst2 = lp_build_interleave2(gallivm, z_type,
778                                        z_value, s_value, 1);
779      }
780      else {
781         unsigned i;
782         LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2];
783         assert(z_src_type.length == 8);
784         for (i = 0; i < 8; i++) {
785            shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
786            shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 +
787                                                   z_src_type.length);
788         }
789         zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value,
790                                          LLVMConstVector(&shuffles[0],
791                                                          z_src_type.length), "");
792         zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value,
793                                          LLVMConstVector(&shuffles[8],
794                                                          z_src_type.length), "");
795      }
796      zs_dst1 = LLVMBuildBitCast(builder, zs_dst1,
797                                 lp_build_vec_type(gallivm, zs_load_type), "");
798      zs_dst2 = LLVMBuildBitCast(builder, zs_dst2,
799                                 lp_build_vec_type(gallivm, zs_load_type), "");
800   }
801
802   LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1);
803   if (!is_1d) {
804      LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
805   }
806}
807
808/**
809 * Generate code for performing depth and/or stencil tests.
810 * We operate on a vector of values (typically n 2x2 quads).
811 *
812 * \param depth  the depth test state
813 * \param stencil  the front/back stencil state
814 * \param type  the data type of the fragment depth/stencil values
815 * \param format_desc  description of the depth/stencil surface
816 * \param mask  the alive/dead pixel mask for the quad (vector)
817 * \param stencil_refs  the front/back stencil ref values (scalar)
818 * \param z_src  the incoming depth/stencil values (n 2x2 quad values, float32)
819 * \param zs_dst  the depth/stencil values in framebuffer
820 * \param face  contains boolean value indicating front/back facing polygon
821 */
822void
823lp_build_depth_stencil_test(struct gallivm_state *gallivm,
824                            const struct pipe_depth_state *depth,
825                            const struct pipe_stencil_state stencil[2],
826                            struct lp_type z_src_type,
827                            const struct util_format_description *format_desc,
828                            struct lp_build_mask_context *mask,
829                            LLVMValueRef stencil_refs[2],
830                            LLVMValueRef z_src,
831                            LLVMValueRef z_fb,
832                            LLVMValueRef s_fb,
833                            LLVMValueRef face,
834                            LLVMValueRef *z_value,
835                            LLVMValueRef *s_value,
836                            boolean do_branch)
837{
838   LLVMBuilderRef builder = gallivm->builder;
839   struct lp_type z_type;
840   struct lp_build_context z_bld;
841   struct lp_build_context s_bld;
842   struct lp_type s_type;
843   unsigned z_shift = 0, z_width = 0, z_mask = 0;
844   LLVMValueRef z_dst = NULL;
845   LLVMValueRef stencil_vals = NULL;
846   LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
847   LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
848   LLVMValueRef orig_mask = lp_build_mask_value(mask);
849   LLVMValueRef front_facing = NULL;
850   boolean have_z, have_s;
851
852   /*
853    * Depths are expected to be between 0 and 1, even if they are stored in
854    * floats. Setting these bits here will ensure that the lp_build_conv() call
855    * below won't try to unnecessarily clamp the incoming values.
856    */
857   if(z_src_type.floating) {
858      z_src_type.sign = FALSE;
859      z_src_type.norm = TRUE;
860   }
861   else {
862      assert(!z_src_type.sign);
863      assert(z_src_type.norm);
864   }
865
866   /* Pick the type matching the depth-stencil format. */
867   z_type = lp_depth_type(format_desc, z_src_type.length);
868
869   /* Pick the intermediate type for depth operations. */
870   z_type.width = z_src_type.width;
871   assert(z_type.length == z_src_type.length);
872
873   /* FIXME: for non-float depth/stencil might generate better code
874    * if we'd always split it up to use 128bit operations.
875    * For stencil we'd almost certainly want to pack to 8xi16 values,
876    * for z just run twice.
877    */
878
879   /* Sanity checking */
880   {
881      const unsigned z_swizzle = format_desc->swizzle[0];
882      const unsigned s_swizzle = format_desc->swizzle[1];
883
884      assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE ||
885             s_swizzle != UTIL_FORMAT_SWIZZLE_NONE);
886
887      assert(depth->enabled || stencil[0].enabled);
888
889      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
890      assert(format_desc->block.width == 1);
891      assert(format_desc->block.height == 1);
892
893      if (stencil[0].enabled) {
894         assert(s_swizzle < 4);
895         assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
896         assert(format_desc->channel[s_swizzle].pure_integer);
897         assert(!format_desc->channel[s_swizzle].normalized);
898         assert(format_desc->channel[s_swizzle].size == 8);
899      }
900
901      if (depth->enabled) {
902         assert(z_swizzle < 4);
903         if (z_type.floating) {
904            assert(z_swizzle == 0);
905            assert(format_desc->channel[z_swizzle].type ==
906                   UTIL_FORMAT_TYPE_FLOAT);
907            assert(format_desc->channel[z_swizzle].size == 32);
908         }
909         else {
910            assert(format_desc->channel[z_swizzle].type ==
911                   UTIL_FORMAT_TYPE_UNSIGNED);
912            assert(format_desc->channel[z_swizzle].normalized);
913            assert(!z_type.fixed);
914         }
915      }
916   }
917
918
919   /* Setup build context for Z vals */
920   lp_build_context_init(&z_bld, gallivm, z_type);
921
922   /* Setup build context for stencil vals */
923   s_type = lp_int_type(z_type);
924   lp_build_context_init(&s_bld, gallivm, s_type);
925
926   /* Compute and apply the Z/stencil bitmasks and shifts.
927    */
928   {
929      unsigned s_shift, s_mask;
930
931      z_dst = z_fb;
932      stencil_vals = s_fb;
933
934      have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
935      have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);
936
937      if (have_z) {
938         if (z_mask != 0xffffffff) {
939            z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
940         }
941
942         /*
943          * Align the framebuffer Z 's LSB to the right.
944          */
945         if (z_shift) {
946            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
947            z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
948         } else if (z_bitmask) {
949            z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
950         } else {
951            lp_build_name(z_dst, "z_dst");
952         }
953      }
954
955      if (have_s) {
956         if (s_shift) {
957            LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
958            stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
959            stencil_shift = shift;  /* used below */
960         }
961
962         if (s_mask != 0xffffffff) {
963            LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
964            stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
965         }
966
967         lp_build_name(stencil_vals, "s_dst");
968      }
969   }
970
971   if (stencil[0].enabled) {
972
973      if (face) {
974         LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
975
976         /* front_facing = face != 0 ? ~0 : 0 */
977         front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
978         front_facing = LLVMBuildSExt(builder, front_facing,
979                                      LLVMIntTypeInContext(gallivm->context,
980                                             s_bld.type.length*s_bld.type.width),
981                                      "");
982         front_facing = LLVMBuildBitCast(builder, front_facing,
983                                         s_bld.int_vec_type, "");
984      }
985
986      /* convert scalar stencil refs into vectors */
987      stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
988      stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);
989
990      s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
991                                          stencil_refs, stencil_vals,
992                                          front_facing);
993
994      /* apply stencil-fail operator */
995      {
996         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, orig_mask, s_pass_mask);
997         stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
998                                            stencil_refs, stencil_vals,
999                                            s_fail_mask, front_facing);
1000      }
1001   }
1002
1003   if (depth->enabled) {
1004      /*
1005       * Convert fragment Z to the desired type, aligning the LSB to the right.
1006       */
1007
1008      assert(z_type.width == z_src_type.width);
1009      assert(z_type.length == z_src_type.length);
1010      assert(lp_check_value(z_src_type, z_src));
1011      if (z_src_type.floating) {
1012         /*
1013          * Convert from floating point values
1014          */
1015
1016         if (!z_type.floating) {
1017            z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,
1018                                                            z_src_type,
1019                                                            z_width,
1020                                                            z_src);
1021         }
1022      } else {
1023         /*
1024          * Convert from unsigned normalized values.
1025          */
1026
1027         assert(!z_src_type.sign);
1028         assert(!z_src_type.fixed);
1029         assert(z_src_type.norm);
1030         assert(!z_type.floating);
1031         if (z_src_type.width > z_width) {
1032            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,
1033                                                        z_src_type.width - z_width);
1034            z_src = LLVMBuildLShr(builder, z_src, shift, "");
1035         }
1036      }
1037      assert(lp_check_value(z_type, z_src));
1038
1039      lp_build_name(z_src, "z_src");
1040
1041      /* compare src Z to dst Z, returning 'pass' mask */
1042      z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
1043
1044      if (!stencil[0].enabled) {
1045         /* We can potentially skip all remaining operations here, but only
1046          * if stencil is disabled because we still need to update the stencil
1047          * buffer values.  Don't need to update Z buffer values.
1048          */
1049         lp_build_mask_update(mask, z_pass);
1050
1051         if (do_branch) {
1052            lp_build_mask_check(mask);
1053            do_branch = FALSE;
1054         }
1055      }
1056
1057      if (depth->writemask) {
1058         LLVMValueRef zselectmask;
1059
1060         /* mask off bits that failed Z test */
1061         zselectmask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
1062
1063         /* mask off bits that failed stencil test */
1064         if (s_pass_mask) {
1065            zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
1066         }
1067
1068         /* Mix the old and new Z buffer values.
1069          * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
1070          */
1071         z_dst = lp_build_select(&z_bld, zselectmask, z_src, z_dst);
1072      }
1073
1074      if (stencil[0].enabled) {
1075         /* update stencil buffer values according to z pass/fail result */
1076         LLVMValueRef z_fail_mask, z_pass_mask;
1077
1078         /* apply Z-fail operator */
1079         z_fail_mask = lp_build_andnot(&s_bld, orig_mask, z_pass);
1080         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
1081                                            stencil_refs, stencil_vals,
1082                                            z_fail_mask, front_facing);
1083
1084         /* apply Z-pass operator */
1085         z_pass_mask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
1086         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
1087                                            stencil_refs, stencil_vals,
1088                                            z_pass_mask, front_facing);
1089      }
1090   }
1091   else {
1092      /* No depth test: apply Z-pass operator to stencil buffer values which
1093       * passed the stencil test.
1094       */
1095      s_pass_mask = LLVMBuildAnd(builder, orig_mask, s_pass_mask, "");
1096      stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
1097                                         stencil_refs, stencil_vals,
1098                                         s_pass_mask, front_facing);
1099   }
1100
1101   /* Put Z and stencil bits in the right place */
1102   if (have_z && z_shift) {
1103      LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
1104      z_dst = LLVMBuildShl(builder, z_dst, shift, "");
1105   }
1106   if (stencil_vals && stencil_shift)
1107      stencil_vals = LLVMBuildShl(builder, stencil_vals,
1108                                  stencil_shift, "");
1109
1110   /* Finally, merge the z/stencil values */
1111   if (format_desc->block.bits <= 32) {
1112      if (have_z && have_s)
1113         *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
1114      else if (have_z)
1115         *z_value = z_dst;
1116      else
1117         *z_value = stencil_vals;
1118      *s_value = *z_value;
1119   }
1120   else {
1121      *z_value = z_dst;
1122      *s_value = stencil_vals;
1123   }
1124
1125   if (s_pass_mask)
1126      lp_build_mask_update(mask, s_pass_mask);
1127
1128   if (depth->enabled && stencil[0].enabled)
1129      lp_build_mask_update(mask, z_pass);
1130}
1131
1132