/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all the basic arithmetic operations we care about
 * (most notably min/max and saturated operations), and it is often necessary
 * to resort to machine-specific intrinsics directly. The functions here hide
 * all these implementation details from the other modules.
 *
 * We also do simple expression simplification here, because:
 * - it is very easy, given that all the necessary information is readily
 *   available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in the [0, 1]
 *   range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_arit.h"


#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special-case values (a or b being 0 or 1) are performed.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   }

   if(intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate max(a, b)
 * No checks for special-case values (a or b being 0 or 1) are performed.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   }

   if(intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 * (For unsigned normalized types, the bitwise complement ~a computes
 * 1.0 - a exactly.)
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
          return LLVMConstFSub(bld->one, a);
      else
          return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
        return bld->one;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a.
 * This operation should be avoided whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors one can do much better with psadbw.
    * We use repeated shuffles/adds here. Note that with multiple vectors
    * this can be done more efficiently, as outlined in the Intel
    * optimization manual.
    * Note: this could cause data rearrangement if used with smaller element
    * sizes.
    */
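
   /*
    * For example, for a length-4 vector the reduction below proceeds as:
    *
    *   vecres = [a0 a1 a2 a3]
    *   vec1 = [a0 a1], vec2 = [a2 a3]  ->  vecres = [a0+a2 a1+a3]
    *   res  = (a0+a2) + (a1+a3)
    */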

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

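   /*
    * Writing src[i] = [xi yi zi wi], the shuffle network below computes:
    *
    *   tmp[0]    = [x0 y0 x1 y1]      tmp[1]    = [z0 w0 z1 w1]
    *   tmp[2]    = [x2 y2 x3 y3]      tmp[3]    = [z2 w2 z3 w3]
    *   sumtmp[0] = [x0+z0 y0+w0 x1+z1 y1+w1]
    *   sumtmp[1] = [x2+z2 y2+w2 x3+z3 y3+w3]
    *
    * shuftmp[0] gathers the even lanes and shuftmp[1] the odd lanes, so
    * the final add yields [sum(src[0]) sum(src[1]) sum(src[2]) sum(src[3])].
    */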
   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming the values are really grouped into 4s, which also determines
 * the output order.
 *
 * Returns a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of the number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* Only use this with at least 2 vectors, as it is sort of expensive
    * (depending on the cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better for a single vector.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                          lp_build_vec_type(gallivm, bld->type),
                                          tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
        return bld->zero;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}


/**
 * Normalized 8-bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria:
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) + ...
 *
 *     in this case just the first two terms, to fit in 16-bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that by itself this doesn't satisfy the OpenGL criteria, as it
 *     gives 255*255 = 254; so either the special case b = 255 must be
 *     handled, or rounding must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division, instead of truncating the
 *     result use rounding in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     which achieves exact results
 *
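 *     As a quick sanity check of the rounded variant at the extremes, with
 *     t = a*b computed in 16-bit arithmetic:
 *
 *       a = b = 255:  t = 65025;  (65025 + 254 + 0x80) >> 8 = 255
 *       a = b = 0:    t = 0;      (0 + 0 + 0x80) >> 8 = 0
 *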
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_u8n(struct gallivm_state *gallivm,
                 struct lp_type i16_type,
                 LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef c8;
   LLVMValueRef ab;

   assert(!i16_type.floating);
   assert(lp_check_value(i16_type, a));
   assert(lp_check_value(i16_type, b));

   c8 = lp_build_const_int_vec(gallivm, i16_type, 8);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 8 */
   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallivm, i16_type, 1), "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");

#endif

   ab = LLVMBuildLShr(builder, ab, c8, "");

   return ab;
}


/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);

         ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME */
      assert(0);
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}


/**
 * Optimized multiplication of a vector by a constant integer scale factor.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster; it will introduce a small
          * error for multiplication by zero, and it will produce wrong
          * results for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->gallivm, bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

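   /* For float vectors on SSE/AVX, a reciprocal multiply is used below on
    * the assumption that lp_build_rcp (a hardware estimate, presumably
    * refined by a Newton-Raphson step) is cheaper than a true divide, at
    * some cost in precision.
    */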
   if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation -- without any checks.
 *
 * Computes v0 + x * (v1 - v0).
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if (bld->type.fixed) {
      /* XXX: This step is necessary for lerping 8-bit colors stored in
       * 16 bits, but it will be wrong for other uses. Basically we need a
       * more powerful lp_type, capable of further distinguishing how the
       * values are interpreted from how they are stored. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
   }

   return res;
}


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
      LLVMValueRef shift;

      assert(type.length >= 2);
      assert(!type.sign);

      /*
       * Create a wider type, enough to hold the intermediate result of the
       * multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.fixed  = TRUE;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Scale x from [0, 255] to [0, 256], by adding x >> (width - 1):
       * e.g. for 8-bit input, 255 -> 255 + 1 = 256 and 0 -> 0.
       */

      shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);

      xl = lp_build_add(&wide_bld, xl,
                        LLVMBuildAShr(builder, xl, shift, ""));
      xh = lp_build_add(&wide_bld, xh,
                        LLVMBuildAShr(builder, xh, shift, ""));

      /*
       * Lerp both halves.
       */

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1);
   }

   return res;
}


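/**
 * Bilinear interpolation: lerp along x on both rows, then lerp the two
 * results along y, i.e. lerp(y, lerp(x, v00, v01), lerp(x, v10, v11)).
 */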
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
   return lp_build_lerp(bld, y, v0, v1);
}


/**
 * Generate min(a, b)
 * Checks for special-case values are performed.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}


/**
 * Generate max(a, b)
 * Checks for special-case values are performed.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b);
}


/**
 * Generate clamp(a, min, max)
 * Checks for special-case values are performed.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
            (type.width == 8 || type.width == 16 || type.width == 32)) {
      debug_printf("%s: inefficient code, should split vectors manually\n",
                   __FUNCTION__);
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and OR it into the constant 1.0 */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values of sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert a vector (or scalar) of ints to a vector (or scalar) of floats.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}

static boolean
sse41_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256))
      return TRUE;

   return FALSE;
}

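/*
 * The values below are passed verbatim as the rounding-mode immediate of
 * the SSE4.1 ROUNDxx instructions (via the llvm.x86.sse41.round.* and
 * llvm.x86.avx.round.* intrinsics used further down).
 */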
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};


/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 yields 2.0, not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      if (type.width * type.length == 128) {
         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.sse41.round.ps";
            break;
         case 64:
            intrinsic = "llvm.x86.sse41.round.pd";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.avx.round.ps.256";
            break;
         case 64:
            intrinsic = "llvm.x86.avx.round.pd.256";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}


static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This relies on the MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_ifloor(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iceil(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Prevent returning a fractional part of 1.0 for very small negative values
 * of 'a', by clamping against 0.99999(9).
 */
static INLINE LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as a float;
    * for 32-bit floats it is 1.0 - 2^-24 */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min(bld, fract, max);
}


/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (sse41_rounding_available(type)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

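      /*
       * Worked example: iround(0.9) computes 0.9 + 0.5 = 1.4, which the
       * truncating conversion below turns into 1; iround(-1.5) computes
       * -1.5 + (-0.5) = -2.0, which becomes -2.
       */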
      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}


/**
 * Return floor of float (vector); the result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
1665 */
1666LLVMValueRef
1667lp_build_ifloor(struct lp_build_context *bld,
1668                LLVMValueRef a)
1669{
1670   LLVMBuilderRef builder = bld->gallivm->builder;
1671   const struct lp_type type = bld->type;
1672   LLVMTypeRef int_vec_type = bld->int_vec_type;
1673   LLVMValueRef res;
1674
1675   assert(type.floating);
1676   assert(lp_check_value(type, a));
1677
1678   res = a;
1679   if (type.sign) {
1680      if (sse41_rounding_available(type)) {
1681         res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1682      }
1683      else {
         /* Conditionally add an offset of -0.99999(9) to negative values,
          * then truncate toward zero. */
         LLVMTypeRef vec_type = bld->vec_type;
         unsigned mantissa = lp_mantissa(type);
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                  (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;
         LLVMValueRef offset;

         /* sign = a < 0 ? ~0 : 0 */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "ifloor.sign");

         /* offset = -0.99999(9)f */
         offset = lp_build_const_vec(bld->gallivm, type,
                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
         offset = LLVMConstBitCast(offset, int_vec_type);
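
         /*
          * Worked example: ifloor(-1.1) computes -1.1 + (-0.99999...) =
          * -2.0999..., which truncates to -2, while ifloor(-1.0) computes
          * -1.9999..., which still truncates to -1; this is why the offset
          * is slightly smaller than one.
          */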

         /* offset = a < 0 ? offset : 0.0f */
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");

         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
      }
   }

   /* truncate (round toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = bld->vec_type;
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef offset;

      /* offset = 0.99999(9)f */
      offset = lp_build_const_vec(bld->gallivm, type,
                                  (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
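
      /*
       * Worked example: iceil(1.1) computes 1.1 + 0.99999... = 2.0999...,
       * which truncates to 2, while iceil(1.0) computes 1.9999..., which
       * still truncates to 1.
       */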
1748
1749      if (type.sign) {
1750         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1751                                (unsigned long long)1 << (type.width - 1));
1752         LLVMValueRef sign;
1753
1754         /* sign = a < 0 ? 0 : ~0 */
1755         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1756         sign = LLVMBuildAnd(builder, sign, mask, "");
1757         sign = LLVMBuildAShr(builder, sign,
1758                              lp_build_const_int_vec(bld->gallivm, type,
1759                                                     type.width - 1),
1760                              "iceil.sign");
1761         sign = LLVMBuildNot(builder, sign, "iceil.not");
1762
1763         /* offset = a < 0 ? 0.0 : offset */
1764         offset = LLVMConstBitCast(offset, int_vec_type);
1765         offset = LLVMBuildAnd(builder, offset, sign, "");
1766         offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
1767      }
1768
1769      res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
1770   }
1771
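   /*
    * E.g. (illustrative): for a = 1.1, res = 1.1 + 0.9999988... = 2.0999...,
    * which truncates to 2; exact integers stay put because the offset is
    * strictly below one.
    */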
   /* truncate (round toward zero) */
1773   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
1774
1775   return res;
1776}
1777
1778
1779/**
1780 * Combined ifloor() & fract().
1781 *
 * Preferable to calling the functions separately, as it ensures that
 * whichever strategy (floor() vs. ifloor()) results in less redundant work
 * is used.
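 *
 * E.g. (illustrative): a = -1.25 yields *out_ipart = -2 and
 * *out_fpart = 0.75.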
1784 */
1785void
1786lp_build_ifloor_fract(struct lp_build_context *bld,
1787                      LLVMValueRef a,
1788                      LLVMValueRef *out_ipart,
1789                      LLVMValueRef *out_fpart)
1790{
1791   LLVMBuilderRef builder = bld->gallivm->builder;
1792   const struct lp_type type = bld->type;
1793   LLVMValueRef ipart;
1794
1795   assert(type.floating);
1796   assert(lp_check_value(type, a));
1797
1798   if (sse41_rounding_available(type)) {
1799      /*
1800       * floor() is easier.
1801       */
1802
1803      ipart = lp_build_floor(bld, a);
1804      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1805      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
1806   }
1807   else {
1808      /*
1809       * ifloor() is easier.
1810       */
1811
1812      *out_ipart = lp_build_ifloor(bld, a);
1813      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
1814      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1815   }
1816}
1817
1818
1819/**
1820 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
1821 * always smaller than one.
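 * (With finite precision, a - floor(a) can round up to exactly 1.0 for a
 * tiny negative a, hence the extra clamp.)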
1822 */
1823void
1824lp_build_ifloor_fract_safe(struct lp_build_context *bld,
1825                           LLVMValueRef a,
1826                           LLVMValueRef *out_ipart,
1827                           LLVMValueRef *out_fpart)
1828{
1829   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
1830   *out_fpart = clamp_fract(bld, *out_fpart);
1831}
1832
1833
1834LLVMValueRef
1835lp_build_sqrt(struct lp_build_context *bld,
1836              LLVMValueRef a)
1837{
1838   LLVMBuilderRef builder = bld->gallivm->builder;
1839   const struct lp_type type = bld->type;
1840   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1841   char intrinsic[32];
1842
1843   assert(lp_check_value(type, a));
1844
1845   /* TODO: optimize the constant case */
1846
1847   assert(type.floating);
1848   if (type.length == 1) {
1849      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
1850   }
1851   else {
1852      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1853   }
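
   /* e.g. (illustrative) a 4 x f32 type yields "llvm.sqrt.v4f32" */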
1854
1855   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1856}
1857
1858
1859/**
 * Do one Newton-Raphson step to improve the reciprocal estimate's precision:
1861 *
1862 *   x_{i+1} = x_i * (2 - a * x_i)
1863 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead.  Certain applications rely on the IEEE
 * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing
 * the Earth's halo.  It would be necessary to clamp the argument to prevent
 * this.
1868 *
1869 * See also:
1870 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1871 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
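 *
 * Worked example (illustrative): for a = 3 and x0 = 0.3,
 * x1 = 0.3 * (2 - 3 * 0.3) = 0.33; another step gives 0.3333, roughly
 * doubling the number of correct digits per iteration.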
1872 */
1873static INLINE LLVMValueRef
1874lp_build_rcp_refine(struct lp_build_context *bld,
1875                    LLVMValueRef a,
1876                    LLVMValueRef rcp_a)
1877{
1878   LLVMBuilderRef builder = bld->gallivm->builder;
1879   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
1880   LLVMValueRef res;
1881
1882   res = LLVMBuildFMul(builder, a, rcp_a, "");
1883   res = LLVMBuildFSub(builder, two, res, "");
1884   res = LLVMBuildFMul(builder, rcp_a, res, "");
1885
1886   return res;
1887}
1888
1889
1890LLVMValueRef
1891lp_build_rcp(struct lp_build_context *bld,
1892             LLVMValueRef a)
1893{
1894   LLVMBuilderRef builder = bld->gallivm->builder;
1895   const struct lp_type type = bld->type;
1896
1897   assert(lp_check_value(type, a));
1898
1899   if(a == bld->zero)
1900      return bld->undef;
1901   if(a == bld->one)
1902      return bld->one;
1903   if(a == bld->undef)
1904      return bld->undef;
1905
1906   assert(type.floating);
1907
1908   if(LLVMIsConstant(a))
1909      return LLVMConstFDiv(bld->one, a);
1910
   /*
    * We don't use RCPPS because:
    * - it only has 10 bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal and
    *   case-dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require fewer workarounds.
    */
1923
1924   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1925         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
1926      const unsigned num_iterations = 0;
1927      LLVMValueRef res;
1928      unsigned i;
1929      const char *intrinsic = NULL;
1930
1931      if (type.length == 4) {
1932         intrinsic = "llvm.x86.sse.rcp.ps";
1933      }
1934      else {
1935         intrinsic = "llvm.x86.avx.rcp.ps.256";
1936      }
1937
1938      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1939
1940      for (i = 0; i < num_iterations; ++i) {
1941         res = lp_build_rcp_refine(bld, a, res);
1942      }
1943
1944      return res;
1945   }
1946
1947   return LLVMBuildFDiv(builder, bld->one, a, "");
1948}
1949
1950
1951/**
1952 * Do one Newton-Raphson step to improve rsqrt precision:
1953 *
1954 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1955 *
1956 * See also:
1957 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
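 *
 * Worked example (illustrative): for a = 4 and x0 = 0.51,
 * x1 = 0.5 * 0.51 * (3 - 4 * 0.51^2) ~= 0.4997, converging quadratically
 * to 1/sqrt(4) = 0.5.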
1958 */
1959static INLINE LLVMValueRef
1960lp_build_rsqrt_refine(struct lp_build_context *bld,
1961                      LLVMValueRef a,
1962                      LLVMValueRef rsqrt_a)
1963{
1964   LLVMBuilderRef builder = bld->gallivm->builder;
1965   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
1966   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
1967   LLVMValueRef res;
1968
1969   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
1970   res = LLVMBuildFMul(builder, a, res, "");
1971   res = LLVMBuildFSub(builder, three, res, "");
1972   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
1973   res = LLVMBuildFMul(builder, half, res, "");
1974
1975   return res;
1976}
1977
1978
1979/**
1980 * Generate 1/sqrt(a)
1981 */
1982LLVMValueRef
1983lp_build_rsqrt(struct lp_build_context *bld,
1984               LLVMValueRef a)
1985{
1986   LLVMBuilderRef builder = bld->gallivm->builder;
1987   const struct lp_type type = bld->type;
1988
1989   assert(lp_check_value(type, a));
1990
1991   assert(type.floating);
1992
1993   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1994        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
1995      const unsigned num_iterations = 1;
1996      LLVMValueRef res;
1997      unsigned i;
1998      const char *intrinsic = NULL;
1999
2000      if (type.length == 4) {
2001         intrinsic = "llvm.x86.sse.rsqrt.ps";
2002      }
2003      else {
2004         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2005      }
2006
2007      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2008
2009
2010      for (i = 0; i < num_iterations; ++i) {
2011         res = lp_build_rsqrt_refine(bld, a, res);
2012      }
2013
2014      return res;
2015   }
2016
2017   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2018}
2019
2020
2021/**
2022 * Generate sin(a) using SSE2
2023 */
2024LLVMValueRef
2025lp_build_sin(struct lp_build_context *bld,
2026             LLVMValueRef a)
2027{
2028   struct gallivm_state *gallivm = bld->gallivm;
2029   LLVMBuilderRef builder = gallivm->builder;
2030   struct lp_type int_type = lp_int_type(bld->type);
2031   LLVMBuilderRef b = builder;
2032
2033   /*
2034    *  take the absolute value,
2035    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2036    */
2037
2038   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2039   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2040
2041   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2042   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2043
2044   /*
2045    * extract the sign bit (upper one)
2046    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2047    */
2048   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2049   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2050
2051   /*
2052    * scale by 4/Pi
2053    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2054    */
2055
2056   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2057   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2058
2059   /*
2060    * store the integer part of y in mm0
2061    * emm2 = _mm_cvttps_epi32(y);
2062    */
2063
2064   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2065
2066   /*
2067    * j=(j+1) & (~1) (see the cephes sources)
2068    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2069    */
2070
2071   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2072   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2073   /*
2074    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2075    */
2076   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2077   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2078
2079   /*
2080    * y = _mm_cvtepi32_ps(emm2);
2081    */
2082   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2083
2084   /* get the swap sign flag
2085    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2086    */
2087   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2088   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2089
2090   /*
2091    * emm2 = _mm_slli_epi32(emm0, 29);
2092    */
2093   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2094   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2095
2096   /*
    * get the polynomial selection mask:
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2.
2100    * Both branches will be computed.
2101    *
2102    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2103    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2104    */
2105
2106   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2107   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2108   LLVMValueRef poly_mask = lp_build_compare(gallivm,
2109                                             int_type, PIPE_FUNC_EQUAL,
2110                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
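
   /*
    * Worked example (illustrative): a = 1.0 gives scale_y ~= 1.2732, hence
    * j = (1 + 1) & ~1 = 2 and y = 2.0, so the reduced argument below is
    * x ~= 1.0 - 2 * Pi/4 = -0.5708; j & 2 != 0 selects the cosine
    * polynomial, and cos(-0.5708) ~= 0.8415 = sin(1.0).
    */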
2111   /*
2112    *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2113    */
2114   LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2115
2116   /*
2117    * _PS_CONST(minus_cephes_DP1, -0.78515625);
2118    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2119    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2120    */
2121   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2122   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2123   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2124
2125   /*
2126    * The magic pass: "Extended precision modular arithmetic"
2127    * x = ((x - y * DP1) - y * DP2) - y * DP3;
2128    * xmm1 = _mm_mul_ps(y, xmm1);
2129    * xmm2 = _mm_mul_ps(y, xmm2);
2130    * xmm3 = _mm_mul_ps(y, xmm3);
2131    */
2132   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2133   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2134   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2135
2136   /*
2137    * x = _mm_add_ps(x, xmm1);
2138    * x = _mm_add_ps(x, xmm2);
2139    * x = _mm_add_ps(x, xmm3);
2140    */
2141
2142   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2143   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2144   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2145
2146   /*
    * Evaluate the first polynomial: cos(x) on the reduced range [-Pi/4, Pi/4]
2148    *
2149    * z = _mm_mul_ps(x,x);
2150    */
2151   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2152
2153   /*
2154    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2155    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2156    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2157    */
2158   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2159   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2160   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2161
2162   /*
2163    * y = *(v4sf*)_ps_coscof_p0;
2164    * y = _mm_mul_ps(y, z);
2165    */
2166   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2167   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2168   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2169   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2170   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2171   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2172
2173
2174   /*
2175    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2176    * y = _mm_sub_ps(y, tmp);
2177    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2178    */
2179   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2180   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2184
2185   /*
2186    * _PS_CONST(sincof_p0, -1.9515295891E-4);
2187    * _PS_CONST(sincof_p1,  8.3321608736E-3);
2188    * _PS_CONST(sincof_p2, -1.6666654611E-1);
2189    */
2190   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2191   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2192   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2193
2194   /*
    * Evaluate the second polynomial: sin(x) on the reduced range [-Pi/4, Pi/4]
2196    *
2197    * y2 = *(v4sf*)_ps_sincof_p0;
2198    * y2 = _mm_mul_ps(y2, z);
2199    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2200    * y2 = _mm_mul_ps(y2, z);
2201    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2202    * y2 = _mm_mul_ps(y2, z);
2203    * y2 = _mm_mul_ps(y2, x);
2204    * y2 = _mm_add_ps(y2, x);
2205    */
2206
2207   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2208   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2209   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2210   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2211   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2212   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2213   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2214
2215   /*
    * select the correct result from the two polynomials
2217    * xmm3 = poly_mask;
2218    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2219    * y = _mm_andnot_ps(xmm3, y);
2220    * y = _mm_add_ps(y,y2);
2221    */
2222   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2223   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2224   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2225   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2226   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2227   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2228   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
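
   /*
    * poly_mask and its complement zero out disjoint bits, so y_and and
    * y2_and never overlap and the integer add above acts as a bitwise or,
    * implementing the select.
    */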
2229
2230   /*
2231    * update the sign
2232    * y = _mm_xor_ps(y, sign_bit);
2233    */
2234   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2235   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2236   return y_result;
2237}
2238
2239
2240/**
2241 * Generate cos(a) using SSE2
2242 */
2243LLVMValueRef
2244lp_build_cos(struct lp_build_context *bld,
2245             LLVMValueRef a)
2246{
2247   struct gallivm_state *gallivm = bld->gallivm;
2248   LLVMBuilderRef builder = gallivm->builder;
2249   struct lp_type int_type = lp_int_type(bld->type);
2250   LLVMBuilderRef b = builder;
2251
2252   /*
2253    *  take the absolute value,
2254    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2255    */
2256
2257   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2258   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2259
2260   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2261   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2262
2263   /*
2264    * scale by 4/Pi
2265    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2266    */
2267
2268   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2269   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2270
2271   /*
2272    * store the integer part of y in mm0
2273    * emm2 = _mm_cvttps_epi32(y);
2274    */
2275
2276   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2277
2278   /*
2279    * j=(j+1) & (~1) (see the cephes sources)
2280    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2281    */
2282
2283   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2284   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2285   /*
2286    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2287    */
2288   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2289   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2290
2291   /*
2292    * y = _mm_cvtepi32_ps(emm2);
2293    */
2294   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2295
2296
2297   /*
2298    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2299    */
2300   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2301   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
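
   /*
    * Compared to lp_build_sin, the quadrant index is offset by 2 here and
    * the swap-sign flag below is taken from the complemented bits (and-not),
    * reflecting that cos leads sin by a quarter period.
    */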
2302
2303
2304   /* get the swap sign flag
2305    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2306    */
2307   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2308   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2309   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2310   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2311
2312   /*
2313    * emm2 = _mm_slli_epi32(emm0, 29);
2314    */
2315   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2316   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2317
2318   /*
    * get the polynomial selection mask:
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2.
2322    * Both branches will be computed.
2323    *
2324    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2325    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2326    */
2327
2328   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2329   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2333
2334   /*
2335    * _PS_CONST(minus_cephes_DP1, -0.78515625);
2336    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2337    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2338    */
2339   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2340   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2341   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2342
2343   /*
2344    * The magic pass: "Extended precision modular arithmetic"
2345    * x = ((x - y * DP1) - y * DP2) - y * DP3;
2346    * xmm1 = _mm_mul_ps(y, xmm1);
2347    * xmm2 = _mm_mul_ps(y, xmm2);
2348    * xmm3 = _mm_mul_ps(y, xmm3);
2349    */
2350   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2351   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2352   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2353
2354   /*
2355    * x = _mm_add_ps(x, xmm1);
2356    * x = _mm_add_ps(x, xmm2);
2357    * x = _mm_add_ps(x, xmm3);
2358    */
2359
2360   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2361   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2362   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2363
2364   /*
    * Evaluate the first polynomial: cos(x) on the reduced range [-Pi/4, Pi/4]
2366    *
2367    * z = _mm_mul_ps(x,x);
2368    */
2369   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2370
2371   /*
2372    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2373    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2374    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2375    */
2376   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2377   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2378   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2379
2380   /*
2381    * y = *(v4sf*)_ps_coscof_p0;
2382    * y = _mm_mul_ps(y, z);
2383    */
2384   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2385   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2386   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2387   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2388   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2389   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2390
2391
2392   /*
2393    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2394    * y = _mm_sub_ps(y, tmp);
2395    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2396    */
2397   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2398   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2402
2403   /*
2404    * _PS_CONST(sincof_p0, -1.9515295891E-4);
2405    * _PS_CONST(sincof_p1,  8.3321608736E-3);
2406    * _PS_CONST(sincof_p2, -1.6666654611E-1);
2407    */
2408   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2409   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2410   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2411
2412   /*
    * Evaluate the second polynomial: sin(x) on the reduced range [-Pi/4, Pi/4]
2414    *
2415    * y2 = *(v4sf*)_ps_sincof_p0;
2416    * y2 = _mm_mul_ps(y2, z);
2417    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2418    * y2 = _mm_mul_ps(y2, z);
2419    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2420    * y2 = _mm_mul_ps(y2, z);
2421    * y2 = _mm_mul_ps(y2, x);
2422    * y2 = _mm_add_ps(y2, x);
2423    */
2424
2425   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2426   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2427   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2428   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2429   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2430   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2431   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2432
2433   /*
    * select the correct result from the two polynomials
2435    * xmm3 = poly_mask;
2436    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2437    * y = _mm_andnot_ps(xmm3, y);
2438    * y = _mm_add_ps(y,y2);
2439    */
2440   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2441   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2442   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2443   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2444   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2445   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2446
2447   /*
2448    * update the sign
2449    * y = _mm_xor_ps(y, sign_bit);
2450    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_cos");
2452   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2453   return y_result;
2454}
2455
2456
2457/**
2458 * Generate pow(x, y)
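 *
 * Computed as exp2(log2(x) * y); e.g. pow(2, 10) = exp2(10 * 1) = 1024,
 * up to the approximation error of the exp2/log2 polynomials.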
2459 */
2460LLVMValueRef
2461lp_build_pow(struct lp_build_context *bld,
2462             LLVMValueRef x,
2463             LLVMValueRef y)
2464{
2465   /* TODO: optimize the constant case */
2466   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2467       LLVMIsConstant(x) && LLVMIsConstant(y)) {
2468      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2469                   __FUNCTION__);
2470   }
2471
2472   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2473}
2474
2475
2476/**
2477 * Generate exp(x)
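 *
 * Computed as exp2(log2(e) * x), since e^x = 2^(x * log2(e)).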
2478 */
2479LLVMValueRef
2480lp_build_exp(struct lp_build_context *bld,
2481             LLVMValueRef x)
2482{
2483   /* log2(e) = 1/log(2) */
2484   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2485                                           1.4426950408889634);
2486
2487   assert(lp_check_value(bld->type, x));
2488
2489   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2490}
2491
2492
2493/**
2494 * Generate log(x)
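 *
 * Computed as log(2) * log2(x).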
2495 */
2496LLVMValueRef
2497lp_build_log(struct lp_build_context *bld,
2498             LLVMValueRef x)
2499{
2500   /* log(2) */
2501   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2502                                          0.69314718055994529);
2503
2504   assert(lp_check_value(bld->type, x));
2505
2506   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2507}
2508
2509
2510/**
2511 * Generate polynomial.
2512 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2513 */
2514static LLVMValueRef
2515lp_build_polynomial(struct lp_build_context *bld,
2516                    LLVMValueRef x,
2517                    const double *coeffs,
2518                    unsigned num_coeffs)
2519{
2520   const struct lp_type type = bld->type;
2521   LLVMValueRef even = NULL, odd = NULL;
2522   LLVMValueRef x2;
2523   unsigned i;
2524
2525   assert(lp_check_value(bld->type, x));
2526
2527   /* TODO: optimize the constant case */
2528   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2529       LLVMIsConstant(x)) {
2530      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2531                   __FUNCTION__);
2532   }
2533
2534   /*
    * Calculate odd and even terms separately to decrease data dependency.
2536    * Ex:
2537    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
2538    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
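    *
    * E.g. (illustrative) with 4 coefficients: even = c[0] + x2 * c[2],
    * odd = c[1] + x2 * c[3], and the result is odd * x + even.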
2539    */
2540   x2 = lp_build_mul(bld, x, x);
2541
2542   for (i = num_coeffs; i--; ) {
2543      LLVMValueRef coeff;
2544
2545      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2546
2547      if (i % 2 == 0) {
2548         if (even)
2549            even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2550         else
2551            even = coeff;
2552      } else {
2553         if (odd)
2554            odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2555         else
2556            odd = coeff;
2557      }
2558   }
2559
2560   if (odd)
2561      return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2562   else if (even)
2563      return even;
2564   else
2565      return bld->undef;
2566}
2567
2568
2569/**
2570 * Minimax polynomial fit of 2**x, in range [0, 1[
2571 */
2572const double lp_build_exp2_polynomial[] = {
2573#if EXP_POLY_DEGREE == 5
2574   0.999999925063526176901,
2575   0.693153073200168932794,
2576   0.240153617044375388211,
2577   0.0558263180532956664775,
2578   0.00898934009049466391101,
2579   0.00187757667519147912699
2580#elif EXP_POLY_DEGREE == 4
2581   1.00000259337069434683,
2582   0.693003834469974940458,
2583   0.24144275689150793076,
2584   0.0520114606103070150235,
2585   0.0135341679161270268764
2586#elif EXP_POLY_DEGREE == 3
2587   0.999925218562710312959,
2588   0.695833540494823811697,
2589   0.226067155427249155588,
2590   0.0780245226406372992967
2591#elif EXP_POLY_DEGREE == 2
2592   1.00172476321474503578,
2593   0.657636275736077639316,
2594   0.33718943461968720704
2595#else
2596#error
2597#endif
2598};
2599
2600
2601void
2602lp_build_exp2_approx(struct lp_build_context *bld,
2603                     LLVMValueRef x,
2604                     LLVMValueRef *p_exp2_int_part,
2605                     LLVMValueRef *p_frac_part,
2606                     LLVMValueRef *p_exp2)
2607{
2608   LLVMBuilderRef builder = bld->gallivm->builder;
2609   const struct lp_type type = bld->type;
2610   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2611   LLVMValueRef ipart = NULL;
2612   LLVMValueRef fpart = NULL;
2613   LLVMValueRef expipart = NULL;
2614   LLVMValueRef expfpart = NULL;
2615   LLVMValueRef res = NULL;
2616
2617   assert(lp_check_value(bld->type, x));
2618
2619   if(p_exp2_int_part || p_frac_part || p_exp2) {
2620      /* TODO: optimize the constant case */
2621      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2622          LLVMIsConstant(x)) {
2623         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2624                      __FUNCTION__);
2625      }
2626
2627      assert(type.floating && type.width == 32);
2628
2629      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
2630      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
2631
2632      /* ipart = floor(x) */
2633      /* fpart = x - ipart */
2634      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2635   }
2636
2637   if(p_exp2_int_part || p_exp2) {
2638      /* expipart = (float) (1 << ipart) */
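      /* e.g. (illustrative): ipart = 3 gives (3 + 127) << 23, the IEEE-754
       * bit pattern of 8.0 = 2^3 */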
2639      expipart = LLVMBuildAdd(builder, ipart,
2640                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
2641      expipart = LLVMBuildShl(builder, expipart,
2642                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
2643      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2644   }
2645
2646   if(p_exp2) {
2647      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2648                                     Elements(lp_build_exp2_polynomial));
2649
2650      res = LLVMBuildFMul(builder, expipart, expfpart, "");
2651   }
2652
2653   if(p_exp2_int_part)
2654      *p_exp2_int_part = expipart;
2655
2656   if(p_frac_part)
2657      *p_frac_part = fpart;
2658
2659   if(p_exp2)
2660      *p_exp2 = res;
2661}
2662
2663
2664LLVMValueRef
2665lp_build_exp2(struct lp_build_context *bld,
2666              LLVMValueRef x)
2667{
2668   LLVMValueRef res;
2669   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2670   return res;
2671}
2672
2673
2674/**
 * Extract the exponent of an IEEE-754 floating point value.
2676 *
2677 * Optionally apply an integer bias.
2678 *
2679 * Result is an integer value with
2680 *
2681 *   ifloor(log2(x)) + bias
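 *
 * E.g. x = 8.0 with bias = 0 yields 3.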
2682 */
2683LLVMValueRef
2684lp_build_extract_exponent(struct lp_build_context *bld,
2685                          LLVMValueRef x,
2686                          int bias)
2687{
2688   LLVMBuilderRef builder = bld->gallivm->builder;
2689   const struct lp_type type = bld->type;
2690   unsigned mantissa = lp_mantissa(type);
2691   LLVMValueRef res;
2692
2693   assert(type.floating);
2694
2695   assert(lp_check_value(bld->type, x));
2696
2697   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2698
2699   res = LLVMBuildLShr(builder, x,
2700                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2701   res = LLVMBuildAnd(builder, res,
2702                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
2703   res = LLVMBuildSub(builder, res,
2704                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2705
2706   return res;
2707}
2708
2709
2710/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
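 *
 * E.g. x = 12.0 = 1.5 * 2^3 yields 12.0 / 8.0 = 1.5.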
2716 */
2717LLVMValueRef
2718lp_build_extract_mantissa(struct lp_build_context *bld,
2719                          LLVMValueRef x)
2720{
2721   LLVMBuilderRef builder = bld->gallivm->builder;
2722   const struct lp_type type = bld->type;
2723   unsigned mantissa = lp_mantissa(type);
2724   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2725                                                  (1ULL << mantissa) - 1);
2726   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2727   LLVMValueRef res;
2728
2729   assert(lp_check_value(bld->type, x));
2730
2731   assert(type.floating);
2732
2733   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2734
2735   /* res = x / 2**ipart */
2736   res = LLVMBuildAnd(builder, x, mantmask, "");
2737   res = LLVMBuildOr(builder, res, one, "");
2738   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2739
2740   return res;
2741}
2742
2743
2744
2745/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2749 */
2750const double lp_build_log2_polynomial[] = {
2751#if LOG_POLY_DEGREE == 5
2752   2.88539008148777786488L,
2753   0.961796878841293367824L,
2754   0.577058946784739859012L,
2755   0.412914355135828735411L,
2756   0.308591899232910175289L,
2757   0.352376952300281371868L,
2758#elif LOG_POLY_DEGREE == 4
2759   2.88539009343309178325L,
2760   0.961791550404184197881L,
2761   0.577440339438736392009L,
2762   0.403343858251329912514L,
2763   0.406718052498846252698L,
2764#elif LOG_POLY_DEGREE == 3
2765   2.88538959748872753838L,
2766   0.961932915889597772928L,
2767   0.571118517972136195241L,
2768   0.493997535084709500285L,
2769#else
2770#error
2771#endif
2772};
2773
2774/**
2775 * See http://www.devmaster.net/forums/showthread.php?p=43580
2776 * http://en.wikipedia.org/wiki/Logarithm#Calculation
2777 * http://www.nezumi.demon.co.uk/consult/logx.htm
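 *
 * With mant in [1, 2), substituting y = (mant - 1) / (mant + 1) gives
 * mant = (1 + y) / (1 - y) with y in [0, 1/3), so z = y^2 lies in [0, 1/9)
 * and log2(mant) = y * P(z), with P the minimax polynomial above.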
2778 */
2779void
2780lp_build_log2_approx(struct lp_build_context *bld,
2781                     LLVMValueRef x,
2782                     LLVMValueRef *p_exp,
2783                     LLVMValueRef *p_floor_log2,
2784                     LLVMValueRef *p_log2)
2785{
2786   LLVMBuilderRef builder = bld->gallivm->builder;
2787   const struct lp_type type = bld->type;
2788   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2789   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2790
2791   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2792   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2793   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2794
2795   LLVMValueRef i = NULL;
2796   LLVMValueRef y = NULL;
2797   LLVMValueRef z = NULL;
2798   LLVMValueRef exp = NULL;
2799   LLVMValueRef mant = NULL;
2800   LLVMValueRef logexp = NULL;
2801   LLVMValueRef logmant = NULL;
2802   LLVMValueRef res = NULL;
2803
2804   assert(lp_check_value(bld->type, x));
2805
2806   if(p_exp || p_floor_log2 || p_log2) {
2807      /* TODO: optimize the constant case */
2808      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2809          LLVMIsConstant(x)) {
2810         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2811                      __FUNCTION__);
2812      }
2813
2814      assert(type.floating && type.width == 32);
2815
2816      /*
       * We don't explicitly handle denormalized numbers.  They will yield a
       * result in the neighbourhood of -127, which appears to be adequate.
2820       */
2821
2822      i = LLVMBuildBitCast(builder, x, int_vec_type, "");
2823
2824      /* exp = (float) exponent(x) */
2825      exp = LLVMBuildAnd(builder, i, expmask, "");
2826   }
2827
2828   if(p_floor_log2 || p_log2) {
2829      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
2830      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
2831      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
2832   }
2833
2834   if(p_log2) {
2835      /* mant = 1 + (float) mantissa(x) */
2836      mant = LLVMBuildAnd(builder, i, mantmask, "");
2837      mant = LLVMBuildOr(builder, mant, one, "");
2838      mant = LLVMBuildBitCast(builder, mant, vec_type, "");
2839
2840      /* y = (mant - 1) / (mant + 1) */
2841      y = lp_build_div(bld,
2842         lp_build_sub(bld, mant, bld->one),
2843         lp_build_add(bld, mant, bld->one)
2844      );
2845
2846      /* z = y^2 */
2847      z = lp_build_mul(bld, y, y);
2848
2849      /* compute P(z) */
2850      logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
2851                                    Elements(lp_build_log2_polynomial));
2852
2853      /* logmant = y * P(z) */
2854      logmant = lp_build_mul(bld, y, logmant);
2855
2856      res = lp_build_add(bld, logmant, logexp);
2857   }
2858
2859   if(p_exp) {
2860      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
2861      *p_exp = exp;
2862   }
2863
2864   if(p_floor_log2)
2865      *p_floor_log2 = logexp;
2866
2867   if(p_log2)
2868      *p_log2 = res;
2869}
2870
2871
2872LLVMValueRef
2873lp_build_log2(struct lp_build_context *bld,
2874              LLVMValueRef x)
2875{
2876   LLVMValueRef res;
2877   lp_build_log2_approx(bld, x, NULL, NULL, &res);
2878   return res;
2879}
2880
2881
2882/**
2883 * Faster (and less accurate) log2.
2884 *
 *    log2(x) ~= floor(log2(x)) - 1 + x / 2**floor(log2(x))
2886 *
2887 * Piece-wise linear approximation, with exact results when x is a
2888 * power of two.
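 *
 * E.g. (illustrative): x = 12 gives ipart = 2 and fpart = 12 / 8 = 1.5,
 * so the result is 3.5 vs. log2(12) ~= 3.585; x = 8 gives exactly 3.0.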
2889 *
2890 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2891 */
2892LLVMValueRef
2893lp_build_fast_log2(struct lp_build_context *bld,
2894                   LLVMValueRef x)
2895{
2896   LLVMBuilderRef builder = bld->gallivm->builder;
2897   LLVMValueRef ipart;
2898   LLVMValueRef fpart;
2899
2900   assert(lp_check_value(bld->type, x));
2901
2902   assert(bld->type.floating);
2903
2904   /* ipart = floor(log2(x)) - 1 */
2905   ipart = lp_build_extract_exponent(bld, x, -1);
2906   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
2907
2908   /* fpart = x / 2**ipart */
2909   fpart = lp_build_extract_mantissa(bld, x);
2910
2911   /* ipart + fpart */
2912   return LLVMBuildFAdd(builder, ipart, fpart, "");
2913}
2914
2915
2916/**
2917 * Fast implementation of iround(log2(x)).
2918 *
2919 * Not an approximation -- it should give accurate results all the time.
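 *
 * Scaling by sqrt(2) adds 0.5 to log2(x), so extracting the exponent of the
 * product computes floor(log2(x) + 0.5) = iround(log2(x)); e.g. x = 6 gives
 * floor(log2(6 * 1.4142...)) = floor(3.085) = 3 = iround(2.585).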
2920 */
2921LLVMValueRef
2922lp_build_ilog2(struct lp_build_context *bld,
2923               LLVMValueRef x)
2924{
2925   LLVMBuilderRef builder = bld->gallivm->builder;
2926   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
2927   LLVMValueRef ipart;
2928
2929   assert(bld->type.floating);
2930
2931   assert(lp_check_value(bld->type, x));
2932
   /* x * 2^0.5, i.e. add 0.5 to log2(x) */
2934   x = LLVMBuildFMul(builder, x, sqrt2, "");
2935
2936   /* ipart = floor(log2(x) + 0.5)  */
2937   ipart = lp_build_extract_exponent(bld, x, 0);
2938
2939   return ipart;
2940}
2941
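
/**
 * Generate x mod y.
 *
 * Maps to FRem/SRem/URem depending on the type; per LLVM's *rem semantics,
 * the result takes the sign of x in the floating point and signed integer
 * cases.
 */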
2942LLVMValueRef
2943lp_build_mod(struct lp_build_context *bld,
2944             LLVMValueRef x,
2945             LLVMValueRef y)
2946{
2947   LLVMBuilderRef builder = bld->gallivm->builder;
2948   LLVMValueRef res;
2949   const struct lp_type type = bld->type;
2950
2951   assert(lp_check_value(type, x));
2952   assert(lp_check_value(type, y));
2953
2954   if (type.floating)
2955      res = LLVMBuildFRem(builder, x, y, "");
2956   else if (type.sign)
2957      res = LLVMBuildSRem(builder, x, y, "");
2958   else
2959      res = LLVMBuildURem(builder, x, y, "");
2960   return res;
2961}
2962