/* lp_bld_arit.c revision 6299f241e9fdd86e705d144a42d9b1979c13f9ad */
1/**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29/**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 *   of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48#include "util/u_memory.h"
49#include "util/u_debug.h"
50#include "util/u_math.h"
51#include "util/u_string.h"
52#include "util/u_cpu_detect.h"
53
54#include "lp_bld_type.h"
55#include "lp_bld_const.h"
56#include "lp_bld_init.h"
57#include "lp_bld_intr.h"
58#include "lp_bld_logic.h"
59#include "lp_bld_pack.h"
60#include "lp_bld_debug.h"
61#include "lp_bld_arit.h"
62
63
64#define EXP_POLY_DEGREE 3
65
66#define LOG_POLY_DEGREE 5
67
68
69/**
70 * Generate min(a, b)
71 * No checks for special case values of a or b = 1 or 0 are done.
72 */
73static LLVMValueRef
74lp_build_min_simple(struct lp_build_context *bld,
75                    LLVMValueRef a,
76                    LLVMValueRef b)
77{
78   LLVMBuilderRef builder = bld->gallivm->builder;
79   const struct lp_type type = bld->type;
80   const char *intrinsic = NULL;
81   LLVMValueRef cond;
82
83   assert(lp_check_value(type, a));
84   assert(lp_check_value(type, b));
85
86   /* TODO: optimize the constant case */
87
88   if(type.width * type.length == 128) {
89      if(type.floating) {
90         if(type.width == 32 && util_cpu_caps.has_sse)
91            intrinsic = "llvm.x86.sse.min.ps";
92         if(type.width == 64 && util_cpu_caps.has_sse2)
93            intrinsic = "llvm.x86.sse2.min.pd";
94      }
95      else {
96         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
97            intrinsic = "llvm.x86.sse2.pminu.b";
98         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
99            intrinsic = "llvm.x86.sse41.pminsb";
100         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
101            intrinsic = "llvm.x86.sse41.pminuw";
102         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
103            intrinsic = "llvm.x86.sse2.pmins.w";
104         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
105            intrinsic = "llvm.x86.sse41.pminud";
106         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
107            intrinsic = "llvm.x86.sse41.pminsd";
108      }
109   }
110
111   if(intrinsic)
112      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
113
114   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
115   return lp_build_select(bld, cond, a, b);
116}
117
118
119/**
120 * Generate max(a, b)
121 * No checks for special case values of a or b = 1 or 0 are done.
122 */
123static LLVMValueRef
124lp_build_max_simple(struct lp_build_context *bld,
125                    LLVMValueRef a,
126                    LLVMValueRef b)
127{
128   LLVMBuilderRef builder = bld->gallivm->builder;
129   const struct lp_type type = bld->type;
130   const char *intrinsic = NULL;
131   LLVMValueRef cond;
132
133   assert(lp_check_value(type, a));
134   assert(lp_check_value(type, b));
135
136   /* TODO: optimize the constant case */
137
138   if(type.width * type.length == 128) {
139      if(type.floating) {
140         if(type.width == 32 && util_cpu_caps.has_sse)
141            intrinsic = "llvm.x86.sse.max.ps";
142         if(type.width == 64 && util_cpu_caps.has_sse2)
143            intrinsic = "llvm.x86.sse2.max.pd";
144      }
145      else {
146         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
147            intrinsic = "llvm.x86.sse2.pmaxu.b";
148         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
149            intrinsic = "llvm.x86.sse41.pmaxsb";
150         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
151            intrinsic = "llvm.x86.sse41.pmaxuw";
152         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
153            intrinsic = "llvm.x86.sse2.pmaxs.w";
154         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
155            intrinsic = "llvm.x86.sse41.pmaxud";
156         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
157            intrinsic = "llvm.x86.sse41.pmaxsd";
158      }
159   }
160
161   if(intrinsic)
162      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
163
164   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
165   return lp_build_select(bld, cond, a, b);
166}
167
168
169/**
170 * Generate 1 - a, or ~a depending on bld->type.
171 */
172LLVMValueRef
173lp_build_comp(struct lp_build_context *bld,
174              LLVMValueRef a)
175{
176   LLVMBuilderRef builder = bld->gallivm->builder;
177   const struct lp_type type = bld->type;
178
179   assert(lp_check_value(type, a));
180
181   if(a == bld->one)
182      return bld->zero;
183   if(a == bld->zero)
184      return bld->one;
185
186   if(type.norm && !type.floating && !type.fixed && !type.sign) {
187      if(LLVMIsConstant(a))
188         return LLVMConstNot(a);
189      else
190         return LLVMBuildNot(builder, a, "");
191   }
192
193   if(LLVMIsConstant(a))
194      if (type.floating)
195          return LLVMConstFSub(bld->one, a);
196      else
197          return LLVMConstSub(bld->one, a);
198   else
199      if (type.floating)
200         return LLVMBuildFSub(builder, bld->one, a, "");
201      else
202         return LLVMBuildSub(builder, bld->one, a, "");
203}
204
205
206/**
207 * Generate a + b
208 */
209LLVMValueRef
210lp_build_add(struct lp_build_context *bld,
211             LLVMValueRef a,
212             LLVMValueRef b)
213{
214   LLVMBuilderRef builder = bld->gallivm->builder;
215   const struct lp_type type = bld->type;
216   LLVMValueRef res;
217
218   assert(lp_check_value(type, a));
219   assert(lp_check_value(type, b));
220
221   if(a == bld->zero)
222      return b;
223   if(b == bld->zero)
224      return a;
225   if(a == bld->undef || b == bld->undef)
226      return bld->undef;
227
228   if(bld->type.norm) {
229      const char *intrinsic = NULL;
230
231      if(a == bld->one || b == bld->one)
232        return bld->one;
233
234      if(util_cpu_caps.has_sse2 &&
235         type.width * type.length == 128 &&
236         !type.floating && !type.fixed) {
237         if(type.width == 8)
238            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
239         if(type.width == 16)
240            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
241      }
242
243      if(intrinsic)
244         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
245   }
246
247   if(LLVMIsConstant(a) && LLVMIsConstant(b))
248      if (type.floating)
249         res = LLVMConstFAdd(a, b);
250      else
251         res = LLVMConstAdd(a, b);
252   else
253      if (type.floating)
254         res = LLVMBuildFAdd(builder, a, b, "");
255      else
256         res = LLVMBuildAdd(builder, a, b, "");
257
258   /* clamp to ceiling of 1.0 */
259   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
260      res = lp_build_min_simple(bld, res, bld->one);
261
262   /* XXX clamp to floor of -1 or 0??? */
263
264   return res;
265}
266
267
268/** Return the scalar sum of the elements of a */
269LLVMValueRef
270lp_build_sum_vector(struct lp_build_context *bld,
271                    LLVMValueRef a)
272{
273   LLVMBuilderRef builder = bld->gallivm->builder;
274   const struct lp_type type = bld->type;
275   LLVMValueRef index, res;
276   unsigned i;
277
278   assert(lp_check_value(type, a));
279
280   if (type.length == 1) {
281      return a;
282   }
283
284   assert(!bld->type.norm);
285
286   index = lp_build_const_int32(bld->gallivm, 0);
287   res = LLVMBuildExtractElement(builder, a, index, "");
288
289   for (i = 1; i < type.length; i++) {
290      index = lp_build_const_int32(bld->gallivm, i);
291      if (type.floating)
292         res = LLVMBuildFAdd(builder, res,
293                            LLVMBuildExtractElement(builder,
294                                                    a, index, ""),
295                            "");
296      else
297         res = LLVMBuildAdd(builder, res,
298                            LLVMBuildExtractElement(builder,
299                                                    a, index, ""),
300                            "");
301   }
302
303   return res;
304}
305
306
307/**
308 * Generate a - b
309 */
310LLVMValueRef
311lp_build_sub(struct lp_build_context *bld,
312             LLVMValueRef a,
313             LLVMValueRef b)
314{
315   LLVMBuilderRef builder = bld->gallivm->builder;
316   const struct lp_type type = bld->type;
317   LLVMValueRef res;
318
319   assert(lp_check_value(type, a));
320   assert(lp_check_value(type, b));
321
322   if(b == bld->zero)
323      return a;
324   if(a == bld->undef || b == bld->undef)
325      return bld->undef;
326   if(a == b)
327      return bld->zero;
328
329   if(bld->type.norm) {
330      const char *intrinsic = NULL;
331
332      if(b == bld->one)
333        return bld->zero;
334
335      if(util_cpu_caps.has_sse2 &&
336         type.width * type.length == 128 &&
337         !type.floating && !type.fixed) {
338         if(type.width == 8)
339            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
340         if(type.width == 16)
341            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
342      }
343
344      if(intrinsic)
345         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
346   }
347
348   if(LLVMIsConstant(a) && LLVMIsConstant(b))
349      if (type.floating)
350         res = LLVMConstFSub(a, b);
351      else
352         res = LLVMConstSub(a, b);
353   else
354      if (type.floating)
355         res = LLVMBuildFSub(builder, a, b, "");
356      else
357         res = LLVMBuildSub(builder, a, b, "");
358
359   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
360      res = lp_build_max_simple(bld, res, bld->zero);
361
362   return res;
363}
364
365
366/**
367 * Normalized 8bit multiplication.
368 *
369 * - alpha plus one
370 *
371 *     makes the following approximation to the division (Sree)
372 *
373 *       a*b/255 ~= (a*(b + 1)) >> 256
374 *
375 *     which is the fastest method that satisfies the following OpenGL criteria
376 *
377 *       0*0 = 0 and 255*255 = 255
378 *
379 * - geometric series
380 *
381 *     takes the geometric series approximation to the division
382 *
383 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
384 *
385 *     in this case just the first two terms to fit in 16bit arithmetic
386 *
387 *       t/255 ~= (t + (t >> 8)) >> 8
388 *
389 *     note that just by itself it doesn't satisfies the OpenGL criteria, as
390 *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
391 *     must be used
392 *
393 * - geometric series plus rounding
394 *
395 *     when using a geometric series division instead of truncating the result
396 *     use roundoff in the approximation (Jim Blinn)
397 *
398 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
399 *
400 *     achieving the exact results
401 *
402 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
403 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
404 * @sa Michael Herf, The "double blend trick", May 2000,
405 *     http://www.stereopsis.com/doubleblend.html
406 */
407static LLVMValueRef
408lp_build_mul_u8n(struct gallivm_state *gallivm,
409                 struct lp_type i16_type,
410                 LLVMValueRef a, LLVMValueRef b)
411{
412   LLVMBuilderRef builder = gallivm->builder;
413   LLVMValueRef c8;
414   LLVMValueRef ab;
415
416   assert(!i16_type.floating);
417   assert(lp_check_value(i16_type, a));
418   assert(lp_check_value(i16_type, b));
419
420   c8 = lp_build_const_int_vec(gallivm, i16_type, 8);
421
422#if 0
423
424   /* a*b/255 ~= (a*(b + 1)) >> 256 */
425   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallium, i16_type, 1), "");
426   ab = LLVMBuildMul(builder, a, b, "");
427
428#else
429
430   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
431   ab = LLVMBuildMul(builder, a, b, "");
432   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
433   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");
434
435#endif
436
437   ab = LLVMBuildLShr(builder, ab, c8, "");
438
439   return ab;
440}
441
442
/**
 * Generate a * b
 *
 * Normalized 8bit integer types take a special path: the operands are
 * widened to 16bit halves, multiplied with the u8n helper (exact a*b/255),
 * and packed back.  Fixed-point types shift the raw product right by
 * width/2 to renormalize.
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* Algebraic identities and undef propagation. */
   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         /* Widen both operands into low/high 16bit halves. */
         lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);

         /* Pack the halves back into the original 8bit type. */
         ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME: normalized multiply only implemented for 8bit so far */
      assert(0);
   }

   /* Fixed-point renormalization shift (width/2 fractional bits). */
   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         /* Arithmetic shift preserves sign for signed fixed point. */
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}
523
524
/**
 * Small vector x scale multiplication optimization.
 *
 * Multiplication by small integer constants: folds the identities
 * 0, 1, -1 and (for floats) 2, turns integer power-of-two factors
 * into shifts, and otherwise falls back to a full multiply.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the mantissa.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         /* Integer power of two: multiply becomes a left shift. */
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   /* General case: materialize the constant and multiply. */
   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
579
580
581/**
582 * Generate a / b
583 */
584LLVMValueRef
585lp_build_div(struct lp_build_context *bld,
586             LLVMValueRef a,
587             LLVMValueRef b)
588{
589   LLVMBuilderRef builder = bld->gallivm->builder;
590   const struct lp_type type = bld->type;
591
592   assert(lp_check_value(type, a));
593   assert(lp_check_value(type, b));
594
595   if(a == bld->zero)
596      return bld->zero;
597   if(a == bld->one)
598      return lp_build_rcp(bld, b);
599   if(b == bld->zero)
600      return bld->undef;
601   if(b == bld->one)
602      return a;
603   if(a == bld->undef || b == bld->undef)
604      return bld->undef;
605
606   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
607      if (type.floating)
608         return LLVMConstFDiv(a, b);
609      else if (type.sign)
610         return LLVMConstSDiv(a, b);
611      else
612         return LLVMConstUDiv(a, b);
613   }
614
615   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
616      return lp_build_mul(bld, a, lp_build_rcp(bld, b));
617
618   if (type.floating)
619      return LLVMBuildFDiv(builder, a, b, "");
620   else if (type.sign)
621      return LLVMBuildSDiv(builder, a, b, "");
622   else
623      return LLVMBuildUDiv(builder, a, b, "");
624}
625
626
/**
 * Linear interpolation -- without any checks.
 *
 * Computes v0 + x*(v1 - v0).
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   /* res = v0 + x * (v1 - v0) */
   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if (bld->type.fixed) {
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for other uses. Basically we need a more
       * powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
   }

   return res;
}
662
663
/**
 * Linear interpolation.
 *
 * Normalized (unsigned) types take a wide fixed-point detour so the
 * intermediate products do not overflow; everything else goes straight
 * to the simple lerp.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
      LLVMValueRef shift;

      assert(type.length >= 2);
      assert(!type.sign);

      /*
       * Create a wider type, enough to hold the intermediate result of the
       * multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.fixed  = TRUE;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      /* Split each input vector into low/high wide halves. */
      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Scale x from [0, 255] to [0, 256] (x + (x >> (width-1))), so that
       * x == max maps exactly to v1.
       */

      shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);

      xl = lp_build_add(&wide_bld, xl,
                        LLVMBuildAShr(builder, xl, shift, ""));
      xh = lp_build_add(&wide_bld, xh,
                        LLVMBuildAShr(builder, xh, shift, ""));

      /*
       * Lerp both halves.
       */

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);

      /* Pack the halves back down to the original narrow type. */
      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1);
   }

   return res;
}
730
731
732LLVMValueRef
733lp_build_lerp_2d(struct lp_build_context *bld,
734                 LLVMValueRef x,
735                 LLVMValueRef y,
736                 LLVMValueRef v00,
737                 LLVMValueRef v01,
738                 LLVMValueRef v10,
739                 LLVMValueRef v11)
740{
741   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
742   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
743   return lp_build_lerp(bld, y, v0, v1);
744}
745
746
747/**
748 * Generate min(a, b)
749 * Do checks for special cases.
750 */
751LLVMValueRef
752lp_build_min(struct lp_build_context *bld,
753             LLVMValueRef a,
754             LLVMValueRef b)
755{
756   assert(lp_check_value(bld->type, a));
757   assert(lp_check_value(bld->type, b));
758
759   if(a == bld->undef || b == bld->undef)
760      return bld->undef;
761
762   if(a == b)
763      return a;
764
765   if(bld->type.norm) {
766      if(a == bld->zero || b == bld->zero)
767         return bld->zero;
768      if(a == bld->one)
769         return b;
770      if(b == bld->one)
771         return a;
772   }
773
774   return lp_build_min_simple(bld, a, b);
775}
776
777
778/**
779 * Generate max(a, b)
780 * Do checks for special cases.
781 */
782LLVMValueRef
783lp_build_max(struct lp_build_context *bld,
784             LLVMValueRef a,
785             LLVMValueRef b)
786{
787   assert(lp_check_value(bld->type, a));
788   assert(lp_check_value(bld->type, b));
789
790   if(a == bld->undef || b == bld->undef)
791      return bld->undef;
792
793   if(a == b)
794      return a;
795
796   if(bld->type.norm) {
797      if(a == bld->one || b == bld->one)
798         return bld->one;
799      if(a == bld->zero)
800         return b;
801      if(b == bld->zero)
802         return a;
803   }
804
805   return lp_build_max_simple(bld, a, b);
806}
807
808
809/**
810 * Generate clamp(a, min, max)
811 * Do checks for special cases.
812 */
813LLVMValueRef
814lp_build_clamp(struct lp_build_context *bld,
815               LLVMValueRef a,
816               LLVMValueRef min,
817               LLVMValueRef max)
818{
819   assert(lp_check_value(bld->type, a));
820   assert(lp_check_value(bld->type, min));
821   assert(lp_check_value(bld->type, max));
822
823   a = lp_build_min(bld, a, max);
824   a = lp_build_max(bld, a, min);
825   return a;
826}
827
828
829/**
830 * Generate abs(a)
831 */
832LLVMValueRef
833lp_build_abs(struct lp_build_context *bld,
834             LLVMValueRef a)
835{
836   LLVMBuilderRef builder = bld->gallivm->builder;
837   const struct lp_type type = bld->type;
838   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
839
840   assert(lp_check_value(type, a));
841
842   if(!type.sign)
843      return a;
844
845   if(type.floating) {
846      /* Mask out the sign bit */
847      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
848      unsigned long long absMask = ~(1ULL << (type.width - 1));
849      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
850      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
851      a = LLVMBuildAnd(builder, a, mask, "");
852      a = LLVMBuildBitCast(builder, a, vec_type, "");
853      return a;
854   }
855
856   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
857      switch(type.width) {
858      case 8:
859         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
860      case 16:
861         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
862      case 32:
863         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
864      }
865   }
866
867   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
868}
869
870
871LLVMValueRef
872lp_build_negate(struct lp_build_context *bld,
873                LLVMValueRef a)
874{
875   LLVMBuilderRef builder = bld->gallivm->builder;
876
877   assert(lp_check_value(bld->type, a));
878
879#if HAVE_LLVM >= 0x0207
880   if (bld->type.floating)
881      a = LLVMBuildFNeg(builder, a, "");
882   else
883#endif
884      a = LLVMBuildNeg(builder, a, "");
885
886   return a;
887}
888
889
/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      /* Mask selecting only the IEEE sign bit (top bit of each element). */
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      /* ORing a's sign bit onto the bits of 1.0 yields +/-1.0. */
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* Signed integer: select +1 or -1 by comparing against zero. */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
939
940
941/**
942 * Set the sign of float vector 'a' according to 'sign'.
943 * If sign==0, return abs(a).
944 * If sign==1, return -abs(a);
945 * Other values for sign produce undefined results.
946 */
947LLVMValueRef
948lp_build_set_sign(struct lp_build_context *bld,
949                  LLVMValueRef a, LLVMValueRef sign)
950{
951   LLVMBuilderRef builder = bld->gallivm->builder;
952   const struct lp_type type = bld->type;
953   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
954   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
955   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
956   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
957                             ~((unsigned long long) 1 << (type.width - 1)));
958   LLVMValueRef val, res;
959
960   assert(type.floating);
961   assert(lp_check_value(type, a));
962
963   /* val = reinterpret_cast<int>(a) */
964   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
965   /* val = val & mask */
966   val = LLVMBuildAnd(builder, val, mask, "");
967   /* sign = sign << shift */
968   sign = LLVMBuildShl(builder, sign, shift, "");
969   /* res = val | sign */
970   res = LLVMBuildOr(builder, val, sign, "");
971   /* res = reinterpret_cast<float>(res) */
972   res = LLVMBuildBitCast(builder, res, vec_type, "");
973
974   return res;
975}
976
977
978/**
979 * Convert vector of (or scalar) int to vector of (or scalar) float.
980 */
981LLVMValueRef
982lp_build_int_to_float(struct lp_build_context *bld,
983                      LLVMValueRef a)
984{
985   LLVMBuilderRef builder = bld->gallivm->builder;
986   const struct lp_type type = bld->type;
987   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
988
989   assert(type.floating);
990
991   return LLVMBuildSIToFP(builder, a, vec_type, "");
992}
993
994
995
/**
 * Rounding modes matching the immediate operand of SSE4.1's
 * ROUNDPS/ROUNDPD/ROUNDSS/ROUNDSD instructions.
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,   /* round to nearest, ties to even */
   LP_BUILD_ROUND_SSE41_FLOOR = 1,     /* round towards -infinity */
   LP_BUILD_ROUND_SSE41_CEIL = 2,      /* round towards +infinity */
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3   /* round towards zero */
};
1003
1004
/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      /* Scalar case: the ROUNDSS/ROUNDSD intrinsics operate on 4-wide
       * vectors, so insert into lane 0 and extract the result back out. */
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         /* only 32/64-bit floats supported */
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      /* args = (pass-through operand, value to round, mode immediate) */
      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      /* Vector case: full-width ROUNDPS/ROUNDPD. */
      assert(type.width*type.length == 128);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ps";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.pd";
         break;
      default:
         /* only 32/64-bit floats supported */
         assert(0);
         return bld->undef;
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}
1080
1081
1082static INLINE LLVMValueRef
1083lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1084                             LLVMValueRef a)
1085{
1086   LLVMBuilderRef builder = bld->gallivm->builder;
1087   const struct lp_type type = bld->type;
1088   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1089   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1090   const char *intrinsic;
1091   LLVMValueRef res;
1092
1093   assert(type.floating);
1094   /* using the double precision conversions is a bit more complicated */
1095   assert(type.width == 32);
1096
1097   assert(lp_check_value(type, a));
1098   assert(util_cpu_caps.has_sse2);
1099
1100   /* This is relying on MXCSR rounding mode, which should always be nearest. */
1101   if (type.length == 1) {
1102      LLVMTypeRef vec_type;
1103      LLVMValueRef undef;
1104      LLVMValueRef arg;
1105      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1106
1107      vec_type = LLVMVectorType(bld->elem_type, 4);
1108
1109      intrinsic = "llvm.x86.sse.cvtss2si";
1110
1111      undef = LLVMGetUndef(vec_type);
1112
1113      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1114
1115      res = lp_build_intrinsic_unary(builder, intrinsic,
1116                                     ret_type, arg);
1117   }
1118   else {
1119      assert(type.width*type.length == 128);
1120
1121      intrinsic = "llvm.x86.sse2.cvtps2dq";
1122
1123      res = lp_build_intrinsic_unary(builder, intrinsic,
1124                                     ret_type, a);
1125   }
1126
1127   return res;
1128}
1129
1130
1131/**
1132 * Return the integer part of a float (vector) value (== round toward zero).
1133 * The returned value is a float (vector).
1134 * Ex: trunc(-1.5) = -1.0
1135 */
1136LLVMValueRef
1137lp_build_trunc(struct lp_build_context *bld,
1138               LLVMValueRef a)
1139{
1140   LLVMBuilderRef builder = bld->gallivm->builder;
1141   const struct lp_type type = bld->type;
1142
1143   assert(type.floating);
1144   assert(lp_check_value(type, a));
1145
1146   if (util_cpu_caps.has_sse4_1 &&
1147       (type.length == 1 || type.width*type.length == 128)) {
1148      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
1149   }
1150   else {
1151      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1152      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1153      LLVMValueRef res;
1154      res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1155      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1156      return res;
1157   }
1158}
1159
1160
1161/**
1162 * Return float (vector) rounded to nearest integer (vector).  The returned
1163 * value is a float (vector).
1164 * Ex: round(0.9) = 1.0
1165 * Ex: round(-1.5) = -2.0
1166 */
1167LLVMValueRef
1168lp_build_round(struct lp_build_context *bld,
1169               LLVMValueRef a)
1170{
1171   LLVMBuilderRef builder = bld->gallivm->builder;
1172   const struct lp_type type = bld->type;
1173
1174   assert(type.floating);
1175   assert(lp_check_value(type, a));
1176
1177   if (util_cpu_caps.has_sse4_1 &&
1178       (type.length == 1 || type.width*type.length == 128)) {
1179      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1180   }
1181   else {
1182      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1183      LLVMValueRef res;
1184      res = lp_build_iround(bld, a);
1185      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1186      return res;
1187   }
1188}
1189
1190
1191/**
1192 * Return floor of float (vector), result is a float (vector)
1193 * Ex: floor(1.1) = 1.0
1194 * Ex: floor(-1.1) = -2.0
1195 */
1196LLVMValueRef
1197lp_build_floor(struct lp_build_context *bld,
1198               LLVMValueRef a)
1199{
1200   LLVMBuilderRef builder = bld->gallivm->builder;
1201   const struct lp_type type = bld->type;
1202
1203   assert(type.floating);
1204   assert(lp_check_value(type, a));
1205
1206   if (util_cpu_caps.has_sse4_1 &&
1207       (type.length == 1 || type.width*type.length == 128)) {
1208      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1209   }
1210   else {
1211      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1212      LLVMValueRef res;
1213      res = lp_build_ifloor(bld, a);
1214      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1215      return res;
1216   }
1217}
1218
1219
1220/**
1221 * Return ceiling of float (vector), returning float (vector).
1222 * Ex: ceil( 1.1) = 2.0
1223 * Ex: ceil(-1.1) = -1.0
1224 */
1225LLVMValueRef
1226lp_build_ceil(struct lp_build_context *bld,
1227              LLVMValueRef a)
1228{
1229   LLVMBuilderRef builder = bld->gallivm->builder;
1230   const struct lp_type type = bld->type;
1231
1232   assert(type.floating);
1233   assert(lp_check_value(type, a));
1234
1235   if (util_cpu_caps.has_sse4_1 &&
1236       (type.length == 1 || type.width*type.length == 128)) {
1237      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1238   }
1239   else {
1240      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1241      LLVMValueRef res;
1242      res = lp_build_iceil(bld, a);
1243      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1244      return res;
1245   }
1246}
1247
1248
1249/**
1250 * Return fractional part of 'a' computed as a - floor(a)
1251 * Typically used in texture coord arithmetic.
1252 */
1253LLVMValueRef
1254lp_build_fract(struct lp_build_context *bld,
1255               LLVMValueRef a)
1256{
1257   assert(bld->type.floating);
1258   return lp_build_sub(bld, a, lp_build_floor(bld, a));
1259}
1260
1261
1262/**
1263 * Return the integer part of a float (vector) value (== round toward zero).
1264 * The returned value is an integer (vector).
1265 * Ex: itrunc(-1.5) = -1
1266 */
1267LLVMValueRef
1268lp_build_itrunc(struct lp_build_context *bld,
1269                LLVMValueRef a)
1270{
1271   LLVMBuilderRef builder = bld->gallivm->builder;
1272   const struct lp_type type = bld->type;
1273   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1274
1275   assert(type.floating);
1276   assert(lp_check_value(type, a));
1277
1278   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1279}
1280
1281
1282/**
1283 * Return float (vector) rounded to nearest integer (vector).  The returned
1284 * value is an integer (vector).
1285 * Ex: iround(0.9) = 1
1286 * Ex: iround(-1.5) = -2
1287 */
1288LLVMValueRef
1289lp_build_iround(struct lp_build_context *bld,
1290                LLVMValueRef a)
1291{
1292   LLVMBuilderRef builder = bld->gallivm->builder;
1293   const struct lp_type type = bld->type;
1294   LLVMTypeRef int_vec_type = bld->int_vec_type;
1295   LLVMValueRef res;
1296
1297   assert(type.floating);
1298
1299   assert(lp_check_value(type, a));
1300
1301   if (util_cpu_caps.has_sse2 &&
1302       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
1303      return lp_build_iround_nearest_sse2(bld, a);
1304   }
1305   else if (util_cpu_caps.has_sse4_1 &&
1306       (type.length == 1 || type.width*type.length == 128)) {
1307      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1308   }
1309   else {
1310      LLVMValueRef half;
1311
1312      half = lp_build_const_vec(bld->gallivm, type, 0.5);
1313
1314      if (type.sign) {
1315         LLVMTypeRef vec_type = bld->vec_type;
1316         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1317                                    (unsigned long long)1 << (type.width - 1));
1318         LLVMValueRef sign;
1319
1320         /* get sign bit */
1321         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1322         sign = LLVMBuildAnd(builder, sign, mask, "");
1323
1324         /* sign * 0.5 */
1325         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1326         half = LLVMBuildOr(builder, sign, half, "");
1327         half = LLVMBuildBitCast(builder, half, vec_type, "");
1328      }
1329
1330      res = LLVMBuildFAdd(builder, a, half, "");
1331   }
1332
1333   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1334
1335   return res;
1336}
1337
1338
1339/**
1340 * Return floor of float (vector), result is an int (vector)
1341 * Ex: ifloor(1.1) = 1.0
1342 * Ex: ifloor(-1.1) = -2.0
1343 */
1344LLVMValueRef
1345lp_build_ifloor(struct lp_build_context *bld,
1346                LLVMValueRef a)
1347{
1348   LLVMBuilderRef builder = bld->gallivm->builder;
1349   const struct lp_type type = bld->type;
1350   LLVMTypeRef int_vec_type = bld->int_vec_type;
1351   LLVMValueRef res;
1352
1353   assert(type.floating);
1354   assert(lp_check_value(type, a));
1355
1356   if (util_cpu_caps.has_sse4_1 &&
1357       (type.length == 1 || type.width*type.length == 128)) {
1358      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1359   }
1360   else {
1361      res = a;
1362
1363      if (type.sign) {
1364         /* Take the sign bit and add it to 1 constant */
1365         LLVMTypeRef vec_type = bld->vec_type;
1366         unsigned mantissa = lp_mantissa(type);
1367         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1368                                  (unsigned long long)1 << (type.width - 1));
1369         LLVMValueRef sign;
1370         LLVMValueRef offset;
1371
1372         /* sign = a < 0 ? ~0 : 0 */
1373         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1374         sign = LLVMBuildAnd(builder, sign, mask, "");
1375         sign = LLVMBuildAShr(builder, sign,
1376                              lp_build_const_int_vec(bld->gallivm, type,
1377                                                     type.width - 1),
1378                              "ifloor.sign");
1379
1380         /* offset = -0.99999(9)f */
1381         offset = lp_build_const_vec(bld->gallivm, type,
1382                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1383         offset = LLVMConstBitCast(offset, int_vec_type);
1384
1385         /* offset = a < 0 ? offset : 0.0f */
1386         offset = LLVMBuildAnd(builder, offset, sign, "");
1387         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");
1388
1389         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
1390      }
1391   }
1392
1393   /* round to nearest (toward zero) */
1394   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
1395
1396   return res;
1397}
1398
1399
1400/**
1401 * Return ceiling of float (vector), returning int (vector).
1402 * Ex: iceil( 1.1) = 2
1403 * Ex: iceil(-1.1) = -1
1404 */
1405LLVMValueRef
1406lp_build_iceil(struct lp_build_context *bld,
1407               LLVMValueRef a)
1408{
1409   LLVMBuilderRef builder = bld->gallivm->builder;
1410   const struct lp_type type = bld->type;
1411   LLVMTypeRef int_vec_type = bld->int_vec_type;
1412   LLVMValueRef res;
1413
1414   assert(type.floating);
1415   assert(lp_check_value(type, a));
1416
1417   if (util_cpu_caps.has_sse4_1 &&
1418       (type.length == 1 || type.width*type.length == 128)) {
1419      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1420   }
1421   else {
1422      LLVMTypeRef vec_type = bld->vec_type;
1423      unsigned mantissa = lp_mantissa(type);
1424      LLVMValueRef offset;
1425
1426      /* offset = 0.99999(9)f */
1427      offset = lp_build_const_vec(bld->gallivm, type,
1428                                  (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1429
1430      if (type.sign) {
1431         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1432                                (unsigned long long)1 << (type.width - 1));
1433         LLVMValueRef sign;
1434
1435         /* sign = a < 0 ? 0 : ~0 */
1436         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1437         sign = LLVMBuildAnd(builder, sign, mask, "");
1438         sign = LLVMBuildAShr(builder, sign,
1439                              lp_build_const_int_vec(bld->gallivm, type,
1440                                                     type.width - 1),
1441                              "iceil.sign");
1442         sign = LLVMBuildNot(builder, sign, "iceil.not");
1443
1444         /* offset = a < 0 ? 0.0 : offset */
1445         offset = LLVMConstBitCast(offset, int_vec_type);
1446         offset = LLVMBuildAnd(builder, offset, sign, "");
1447         offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
1448      }
1449
1450      res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
1451   }
1452
1453   /* round to nearest (toward zero) */
1454   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
1455
1456   return res;
1457}
1458
1459
1460/**
1461 * Combined ifloor() & fract().
1462 *
1463 * Preferred to calling the functions separately, as it will ensure that the
1464 * stratergy (floor() vs ifloor()) that results in less redundant work is used.
1465 */
1466void
1467lp_build_ifloor_fract(struct lp_build_context *bld,
1468                      LLVMValueRef a,
1469                      LLVMValueRef *out_ipart,
1470                      LLVMValueRef *out_fpart)
1471{
1472   LLVMBuilderRef builder = bld->gallivm->builder;
1473   const struct lp_type type = bld->type;
1474   LLVMValueRef ipart;
1475
1476   assert(type.floating);
1477   assert(lp_check_value(type, a));
1478
1479   if (util_cpu_caps.has_sse4_1 &&
1480       (type.length == 1 || type.width*type.length == 128)) {
1481      /*
1482       * floor() is easier.
1483       */
1484
1485      ipart = lp_build_floor(bld, a);
1486      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1487      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
1488   }
1489   else {
1490      /*
1491       * ifloor() is easier.
1492       */
1493
1494      *out_ipart = lp_build_ifloor(bld, a);
1495      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
1496      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1497   }
1498}
1499
1500
/**
 * Generate sqrt(a) via LLVM's generic sqrt intrinsic.
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   /* e.g. "llvm.sqrt.v4f32" for a 4 x float32 vector.
    * NOTE(review): for type.length == 1 this produces "llvm.sqrt.v1f32"
    * rather than "llvm.sqrt.f32" -- verify the 1-element vector form is
    * accepted for scalar types.
    */
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
1520
1521
1522/**
1523 * Do one Newton-Raphson step to improve reciprocate precision:
1524 *
1525 *   x_{i+1} = x_i * (2 - a * x_i)
1526 *
1527 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1528 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
1529 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
1530 * halo. It would be necessary to clamp the argument to prevent this.
1531 *
1532 * See also:
1533 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1534 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1535 */
1536static INLINE LLVMValueRef
1537lp_build_rcp_refine(struct lp_build_context *bld,
1538                    LLVMValueRef a,
1539                    LLVMValueRef rcp_a)
1540{
1541   LLVMBuilderRef builder = bld->gallivm->builder;
1542   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
1543   LLVMValueRef res;
1544
1545   res = LLVMBuildFMul(builder, a, rcp_a, "");
1546   res = LLVMBuildFSub(builder, two, res, "");
1547   res = LLVMBuildFMul(builder, rcp_a, res, "");
1548
1549   return res;
1550}
1551
1552
/**
 * Generate 1/a, with algebraic shortcuts for trivial operands and constant
 * folding for constant ones.
 */
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   /* 1/0 is undefined; propagate undef rather than building a division. */
   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   /* Deliberately disabled via FALSE per the rationale above; kept so it can
    * be re-enabled for benchmarking.
    */
   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}
1603
1604
1605/**
1606 * Do one Newton-Raphson step to improve rsqrt precision:
1607 *
1608 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1609 *
1610 * See also:
1611 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1612 */
1613static INLINE LLVMValueRef
1614lp_build_rsqrt_refine(struct lp_build_context *bld,
1615                      LLVMValueRef a,
1616                      LLVMValueRef rsqrt_a)
1617{
1618   LLVMBuilderRef builder = bld->gallivm->builder;
1619   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
1620   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
1621   LLVMValueRef res;
1622
1623   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
1624   res = LLVMBuildFMul(builder, a, res, "");
1625   res = LLVMBuildFSub(builder, three, res, "");
1626   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
1627   res = LLVMBuildFMul(builder, half, res, "");
1628
1629   return res;
1630}
1631
1632
1633/**
1634 * Generate 1/sqrt(a)
1635 */
1636LLVMValueRef
1637lp_build_rsqrt(struct lp_build_context *bld,
1638               LLVMValueRef a)
1639{
1640   LLVMBuilderRef builder = bld->gallivm->builder;
1641   const struct lp_type type = bld->type;
1642
1643   assert(lp_check_value(type, a));
1644
1645   assert(type.floating);
1646
1647   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1648      const unsigned num_iterations = 0;
1649      LLVMValueRef res;
1650      unsigned i;
1651
1652      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1653
1654      for (i = 0; i < num_iterations; ++i) {
1655         res = lp_build_rsqrt_refine(bld, a, res);
1656      }
1657
1658      return res;
1659   }
1660
1661   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1662}
1663
1664
1665static inline LLVMValueRef
1666lp_build_const_v4si(struct gallivm_state *gallivm, unsigned long value)
1667{
1668   LLVMValueRef element = lp_build_const_int32(gallivm, value);
1669   LLVMValueRef elements[4] = { element, element, element, element };
1670   return LLVMConstVector(elements, 4);
1671}
1672
1673static inline LLVMValueRef
1674lp_build_const_v4sf(struct gallivm_state *gallivm, float value)
1675{
1676   LLVMValueRef element = lp_build_const_float(gallivm, value);
1677   LLVMValueRef elements[4] = { element, element, element, element };
1678   return LLVMConstVector(elements, 4);
1679}
1680
1681
1682/**
1683 * Generate sin(a) using SSE2
1684 */
1685LLVMValueRef
1686lp_build_sin(struct lp_build_context *bld,
1687             LLVMValueRef a)
1688{
1689   LLVMBuilderRef builder = bld->gallivm->builder;
1690   struct gallivm_state *gallivm = bld->gallivm;
1691   struct lp_type int_type = lp_int_type(bld->type);
1692   LLVMBuilderRef b = builder;
1693   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatTypeInContext(bld->gallivm->context), 4);
1694   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32TypeInContext(bld->gallivm->context), 4);
1695
1696   /*
1697    *  take the absolute value,
1698    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1699    */
1700
1701   LLVMValueRef inv_sig_mask = lp_build_const_v4si(bld->gallivm, ~0x80000000);
1702   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1703
1704   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1705   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1706
1707   /*
1708    * extract the sign bit (upper one)
1709    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1710    */
1711   LLVMValueRef sig_mask = lp_build_const_v4si(bld->gallivm, 0x80000000);
1712   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1713
1714   /*
1715    * scale by 4/Pi
1716    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1717    */
1718
1719   LLVMValueRef FOPi = lp_build_const_v4sf(gallivm, 1.27323954473516);
1720   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1721
1722   /*
1723    * store the integer part of y in mm0
1724    * emm2 = _mm_cvttps_epi32(y);
1725    */
1726
1727   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1728
1729   /*
1730    * j=(j+1) & (~1) (see the cephes sources)
1731    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1732    */
1733
1734   LLVMValueRef all_one = lp_build_const_v4si(bld->gallivm, 1);
1735   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1736   /*
1737    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1738    */
1739   LLVMValueRef inv_one = lp_build_const_v4si(bld->gallivm, ~1);
1740   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1741
1742   /*
1743    * y = _mm_cvtepi32_ps(emm2);
1744    */
1745   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1746
1747   /* get the swap sign flag
1748    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1749    */
1750   LLVMValueRef pi32_4 = lp_build_const_v4si(bld->gallivm, 4);
1751   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1752
1753   /*
1754    * emm2 = _mm_slli_epi32(emm0, 29);
1755    */
1756   LLVMValueRef const_29 = lp_build_const_v4si(bld->gallivm, 29);
1757   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1758
1759   /*
1760    * get the polynom selection mask
1761    * there is one polynom for 0 <= x <= Pi/4
1762    * and another one for Pi/4<x<=Pi/2
1763    * Both branches will be computed.
1764    *
1765    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1766    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1767    */
1768
1769   LLVMValueRef pi32_2 = lp_build_const_v4si(bld->gallivm, 2);
1770   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1771   LLVMValueRef poly_mask = lp_build_compare(bld->gallivm,
1772                                             int_type, PIPE_FUNC_EQUAL,
1773                                             emm2_3, lp_build_const_v4si(bld->gallivm, 0));
1774   /*
1775    *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1776    */
1777   LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1778
1779   /*
1780    * _PS_CONST(minus_cephes_DP1, -0.78515625);
1781    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1782    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1783    */
1784   LLVMValueRef DP1 = lp_build_const_v4sf(gallivm, -0.78515625);
1785   LLVMValueRef DP2 = lp_build_const_v4sf(gallivm, -2.4187564849853515625e-4);
1786   LLVMValueRef DP3 = lp_build_const_v4sf(gallivm, -3.77489497744594108e-8);
1787
1788   /*
1789    * The magic pass: "Extended precision modular arithmetic"
1790    * x = ((x - y * DP1) - y * DP2) - y * DP3;
1791    * xmm1 = _mm_mul_ps(y, xmm1);
1792    * xmm2 = _mm_mul_ps(y, xmm2);
1793    * xmm3 = _mm_mul_ps(y, xmm3);
1794    */
1795   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1796   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1797   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1798
1799   /*
1800    * x = _mm_add_ps(x, xmm1);
1801    * x = _mm_add_ps(x, xmm2);
1802    * x = _mm_add_ps(x, xmm3);
1803    */
1804
1805   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1806   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1807   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1808
1809   /*
1810    * Evaluate the first polynom  (0 <= x <= Pi/4)
1811    *
1812    * z = _mm_mul_ps(x,x);
1813    */
1814   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1815
1816   /*
1817    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
1818    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1819    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
1820    */
1821   LLVMValueRef coscof_p0 = lp_build_const_v4sf(gallivm, 2.443315711809948E-005);
1822   LLVMValueRef coscof_p1 = lp_build_const_v4sf(gallivm, -1.388731625493765E-003);
1823   LLVMValueRef coscof_p2 = lp_build_const_v4sf(gallivm, 4.166664568298827E-002);
1824
1825   /*
1826    * y = *(v4sf*)_ps_coscof_p0;
1827    * y = _mm_mul_ps(y, z);
1828    */
1829   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1830   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1831   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1832   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1833   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1834   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1835
1836
1837   /*
1838    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1839    * y = _mm_sub_ps(y, tmp);
1840    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1841    */
1842   LLVMValueRef half = lp_build_const_v4sf(gallivm, 0.5);
1843   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1844   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1845   LLVMValueRef one = lp_build_const_v4sf(gallivm, 1.0);
1846   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1847
1848   /*
1849    * _PS_CONST(sincof_p0, -1.9515295891E-4);
1850    * _PS_CONST(sincof_p1,  8.3321608736E-3);
1851    * _PS_CONST(sincof_p2, -1.6666654611E-1);
1852    */
1853   LLVMValueRef sincof_p0 = lp_build_const_v4sf(gallivm, -1.9515295891E-4);
1854   LLVMValueRef sincof_p1 = lp_build_const_v4sf(gallivm, 8.3321608736E-3);
1855   LLVMValueRef sincof_p2 = lp_build_const_v4sf(gallivm, -1.6666654611E-1);
1856
1857   /*
1858    * Evaluate the second polynom  (Pi/4 <= x <= 0)
1859    *
1860    * y2 = *(v4sf*)_ps_sincof_p0;
1861    * y2 = _mm_mul_ps(y2, z);
1862    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1863    * y2 = _mm_mul_ps(y2, z);
1864    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1865    * y2 = _mm_mul_ps(y2, z);
1866    * y2 = _mm_mul_ps(y2, x);
1867    * y2 = _mm_add_ps(y2, x);
1868    */
1869
1870   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1871   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1872   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1873   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1874   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1875   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1876   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1877
1878   /*
1879    * select the correct result from the two polynoms
1880    * xmm3 = poly_mask;
1881    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1882    * y = _mm_andnot_ps(xmm3, y);
1883    * y = _mm_add_ps(y,y2);
1884    */
1885   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1886   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1887   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1888   LLVMValueRef inv = lp_build_const_v4si(bld->gallivm, ~0);
1889   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1890   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1891   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1892
1893   /*
1894    * update the sign
1895    * y = _mm_xor_ps(y, sign_bit);
1896    */
1897   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1898   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1899   return y_result;
1900}
1901
1902
1903/**
1904 * Generate cos(a) using SSE2
1905 */
1906LLVMValueRef
1907lp_build_cos(struct lp_build_context *bld,
1908             LLVMValueRef a)
1909{
1910   LLVMBuilderRef builder = bld->gallivm->builder;
1911   struct gallivm_state *gallivm = bld->gallivm;
1912   struct lp_type int_type = lp_int_type(bld->type);
1913   LLVMBuilderRef b = builder;
1914   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatTypeInContext(bld->gallivm->context), 4);
1915   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32TypeInContext(bld->gallivm->context), 4);
1916
1917   /*
1918    *  take the absolute value,
1919    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1920    */
1921
1922   LLVMValueRef inv_sig_mask = lp_build_const_v4si(bld->gallivm, ~0x80000000);
1923   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1924
1925   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1926   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1927
1928   /*
1929    * scale by 4/Pi
1930    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1931    */
1932
1933   LLVMValueRef FOPi = lp_build_const_v4sf(gallivm, 1.27323954473516);
1934   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1935
1936   /*
1937    * store the integer part of y in mm0
1938    * emm2 = _mm_cvttps_epi32(y);
1939    */
1940
1941   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1942
1943   /*
1944    * j=(j+1) & (~1) (see the cephes sources)
1945    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1946    */
1947
1948   LLVMValueRef all_one = lp_build_const_v4si(bld->gallivm, 1);
1949   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1950   /*
1951    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1952    */
1953   LLVMValueRef inv_one = lp_build_const_v4si(bld->gallivm, ~1);
1954   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1955
1956   /*
1957    * y = _mm_cvtepi32_ps(emm2);
1958    */
1959   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1960
1961
1962   /*
1963    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1964    */
1965   LLVMValueRef const_2 = lp_build_const_v4si(bld->gallivm, 2);
1966   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1967
1968
1969   /* get the swap sign flag
1970    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1971    */
1972   LLVMValueRef inv = lp_build_const_v4si(bld->gallivm, ~0);
1973   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1974   LLVMValueRef pi32_4 = lp_build_const_v4si(bld->gallivm, 4);
1975   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1976
1977   /*
1978    * emm2 = _mm_slli_epi32(emm0, 29);
1979    */
1980   LLVMValueRef const_29 = lp_build_const_v4si(bld->gallivm, 29);
1981   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1982
1983   /*
1984    * get the polynom selection mask
1985    * there is one polynom for 0 <= x <= Pi/4
1986    * and another one for Pi/4<x<=Pi/2
1987    * Both branches will be computed.
1988    *
1989    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1990    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1991    */
1992
1993   LLVMValueRef pi32_2 = lp_build_const_v4si(bld->gallivm, 2);
1994   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1995   LLVMValueRef poly_mask = lp_build_compare(bld->gallivm,
1996                                             int_type, PIPE_FUNC_EQUAL,
1997   				             emm2_3, lp_build_const_v4si(bld->gallivm, 0));
1998
1999   /*
2000    * _PS_CONST(minus_cephes_DP1, -0.78515625);
2001    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2002    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2003    */
2004   LLVMValueRef DP1 = lp_build_const_v4sf(gallivm, -0.78515625);
2005   LLVMValueRef DP2 = lp_build_const_v4sf(gallivm, -2.4187564849853515625e-4);
2006   LLVMValueRef DP3 = lp_build_const_v4sf(gallivm, -3.77489497744594108e-8);
2007
2008   /*
2009    * The magic pass: "Extended precision modular arithmetic"
2010    * x = ((x - y * DP1) - y * DP2) - y * DP3;
2011    * xmm1 = _mm_mul_ps(y, xmm1);
2012    * xmm2 = _mm_mul_ps(y, xmm2);
2013    * xmm3 = _mm_mul_ps(y, xmm3);
2014    */
2015   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2016   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2017   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2018
2019   /*
2020    * x = _mm_add_ps(x, xmm1);
2021    * x = _mm_add_ps(x, xmm2);
2022    * x = _mm_add_ps(x, xmm3);
2023    */
2024
2025   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2026   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2027   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2028
2029   /*
2030    * Evaluate the first polynom  (0 <= x <= Pi/4)
2031    *
2032    * z = _mm_mul_ps(x,x);
2033    */
2034   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2035
2036   /*
2037    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2038    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2039    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2040    */
2041   LLVMValueRef coscof_p0 = lp_build_const_v4sf(gallivm, 2.443315711809948E-005);
2042   LLVMValueRef coscof_p1 = lp_build_const_v4sf(gallivm, -1.388731625493765E-003);
2043   LLVMValueRef coscof_p2 = lp_build_const_v4sf(gallivm, 4.166664568298827E-002);
2044
2045   /*
2046    * y = *(v4sf*)_ps_coscof_p0;
2047    * y = _mm_mul_ps(y, z);
2048    */
2049   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2050   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2051   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2052   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2053   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2054   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2055
2056
2057   /*
2058    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2059    * y = _mm_sub_ps(y, tmp);
2060    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2061    */
2062   LLVMValueRef half = lp_build_const_v4sf(gallivm, 0.5);
2063   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2064   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2065   LLVMValueRef one = lp_build_const_v4sf(gallivm, 1.0);
2066   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2067
2068   /*
2069    * _PS_CONST(sincof_p0, -1.9515295891E-4);
2070    * _PS_CONST(sincof_p1,  8.3321608736E-3);
2071    * _PS_CONST(sincof_p2, -1.6666654611E-1);
2072    */
2073   LLVMValueRef sincof_p0 = lp_build_const_v4sf(gallivm, -1.9515295891E-4);
2074   LLVMValueRef sincof_p1 = lp_build_const_v4sf(gallivm, 8.3321608736E-3);
2075   LLVMValueRef sincof_p2 = lp_build_const_v4sf(gallivm, -1.6666654611E-1);
2076
2077   /*
2078    * Evaluate the second polynom  (Pi/4 <= x <= 0)
2079    *
2080    * y2 = *(v4sf*)_ps_sincof_p0;
2081    * y2 = _mm_mul_ps(y2, z);
2082    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2083    * y2 = _mm_mul_ps(y2, z);
2084    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2085    * y2 = _mm_mul_ps(y2, z);
2086    * y2 = _mm_mul_ps(y2, x);
2087    * y2 = _mm_add_ps(y2, x);
2088    */
2089
2090   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2091   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2092   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2093   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2094   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2095   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2096   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2097
2098   /*
2099    * select the correct result from the two polynoms
2100    * xmm3 = poly_mask;
2101    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2102    * y = _mm_andnot_ps(xmm3, y);
2103    * y = _mm_add_ps(y,y2);
2104    */
2105   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
2106   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
2107   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2108   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2109   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2110   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2111
2112   /*
2113    * update the sign
2114    * y = _mm_xor_ps(y, sign_bit);
2115    */
2116   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2117   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
2118   return y_result;
2119}
2120
2121
2122/**
2123 * Generate pow(x, y)
2124 */
2125LLVMValueRef
2126lp_build_pow(struct lp_build_context *bld,
2127             LLVMValueRef x,
2128             LLVMValueRef y)
2129{
2130   /* TODO: optimize the constant case */
2131   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2132       LLVMIsConstant(x) && LLVMIsConstant(y)) {
2133      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2134                   __FUNCTION__);
2135   }
2136
2137   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2138}
2139
2140
2141/**
2142 * Generate exp(x)
2143 */
2144LLVMValueRef
2145lp_build_exp(struct lp_build_context *bld,
2146             LLVMValueRef x)
2147{
2148   /* log2(e) = 1/log(2) */
2149   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2150                                           1.4426950408889634);
2151
2152   assert(lp_check_value(bld->type, x));
2153
2154   return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
2155}
2156
2157
2158/**
2159 * Generate log(x)
2160 */
2161LLVMValueRef
2162lp_build_log(struct lp_build_context *bld,
2163             LLVMValueRef x)
2164{
2165   /* log(2) */
2166   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2167                                          0.69314718055994529);
2168
2169   assert(lp_check_value(bld->type, x));
2170
2171   return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
2172}
2173
2174
2175/**
2176 * Generate polynomial.
2177 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2178 */
2179static LLVMValueRef
2180lp_build_polynomial(struct lp_build_context *bld,
2181                    LLVMValueRef x,
2182                    const double *coeffs,
2183                    unsigned num_coeffs)
2184{
2185   const struct lp_type type = bld->type;
2186   LLVMValueRef res = NULL;
2187   unsigned i;
2188
2189   assert(lp_check_value(bld->type, x));
2190
2191   /* TODO: optimize the constant case */
2192   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2193       LLVMIsConstant(x)) {
2194      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2195                   __FUNCTION__);
2196   }
2197
2198   for (i = num_coeffs; i--; ) {
2199      LLVMValueRef coeff;
2200
2201      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2202
2203      if(res)
2204         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
2205      else
2206         res = coeff;
2207   }
2208
2209   if(res)
2210      return res;
2211   else
2212      return bld->undef;
2213}
2214
2215
2216/**
2217 * Minimax polynomial fit of 2**x, in range [0, 1[
2218 */
2219const double lp_build_exp2_polynomial[] = {
2220#if EXP_POLY_DEGREE == 5
2221   0.999999999690134838155,
2222   0.583974334321735217258,
2223   0.164553105719676828492,
2224   0.0292811063701710962255,
2225   0.00354944426657875141846,
2226   0.000296253726543423377365
2227#elif EXP_POLY_DEGREE == 4
2228   1.00000001502262084505,
2229   0.563586057338685991394,
2230   0.150436017652442413623,
2231   0.0243220604213317927308,
2232   0.0025359088446580436489
2233#elif EXP_POLY_DEGREE == 3
2234   0.999925218562710312959,
2235   0.695833540494823811697,
2236   0.226067155427249155588,
2237   0.0780245226406372992967
2238#elif EXP_POLY_DEGREE == 2
2239   1.00172476321474503578,
2240   0.657636275736077639316,
2241   0.33718943461968720704
2242#else
2243#error
2244#endif
2245};
2246
2247
/**
 * Compute parts of the 2^x approximation, as needed by the caller.
 *
 * Any of the output pointers may be NULL; only the work needed for the
 * non-NULL outputs is emitted.  Only 32-bit float vectors are supported
 * (asserted below), since the code builds IEEE-754 single-precision bit
 * patterns directly.
 *
 * \param x                the exponent operand (clamped to ~[-127, 129])
 * \param p_exp2_int_part  if non-NULL, receives 2^floor(x) as a float vector
 * \param p_frac_part      if non-NULL, receives x - floor(x)
 * \param p_exp2           if non-NULL, receives the full 2^x approximation
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /* Clamp x so the biased exponent computed below stays in range. */
      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      ipart = lp_build_floor(bld, x);

      /* fpart = x - ipart, in [0, 1) */
      fpart = LLVMBuildFSub(builder, x, ipart, "");
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart)
       * Built directly as IEEE-754 bits: add the exponent bias (127) and
       * shift into the exponent field (bit 23), giving a float whose value
       * is exactly 2^ipart.
       */
      ipart = LLVMBuildFPToSI(builder, ipart, int_vec_type, "");
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* 2^fpart via the minimax polynomial; 2^x = 2^ipart * 2^fpart */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
2313
2314
2315LLVMValueRef
2316lp_build_exp2(struct lp_build_context *bld,
2317              LLVMValueRef x)
2318{
2319   LLVMValueRef res;
2320   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2321   return res;
2322}
2323
2324
2325/**
2326 * Extract the exponent of a IEEE-754 floating point value.
2327 *
2328 * Optionally apply an integer bias.
2329 *
2330 * Result is an integer value with
2331 *
2332 *   ifloor(log2(x)) + bias
2333 */
2334LLVMValueRef
2335lp_build_extract_exponent(struct lp_build_context *bld,
2336                          LLVMValueRef x,
2337                          int bias)
2338{
2339   LLVMBuilderRef builder = bld->gallivm->builder;
2340   const struct lp_type type = bld->type;
2341   unsigned mantissa = lp_mantissa(type);
2342   LLVMValueRef res;
2343
2344   assert(type.floating);
2345
2346   assert(lp_check_value(bld->type, x));
2347
2348   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2349
2350   res = LLVMBuildLShr(builder, x,
2351                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2352   res = LLVMBuildAnd(builder, res,
2353                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
2354   res = LLVMBuildSub(builder, res,
2355                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2356
2357   return res;
2358}
2359
2360
2361/**
2362 * Extract the mantissa of the a floating.
2363 *
2364 * Result is a floating point value with
2365 *
2366 *   x / floor(log2(x))
2367 */
2368LLVMValueRef
2369lp_build_extract_mantissa(struct lp_build_context *bld,
2370                          LLVMValueRef x)
2371{
2372   LLVMBuilderRef builder = bld->gallivm->builder;
2373   const struct lp_type type = bld->type;
2374   unsigned mantissa = lp_mantissa(type);
2375   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2376                                                  (1ULL << mantissa) - 1);
2377   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2378   LLVMValueRef res;
2379
2380   assert(lp_check_value(bld->type, x));
2381
2382   assert(type.floating);
2383
2384   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2385
2386   /* res = x / 2**ipart */
2387   res = LLVMBuildAnd(builder, x, mantmask, "");
2388   res = LLVMBuildOr(builder, res, one, "");
2389   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2390
2391   return res;
2392}
2393
2394
2395
2396/**
2397 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2398 * These coefficients can be generate with
2399 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2400 */
2401const double lp_build_log2_polynomial[] = {
2402#if LOG_POLY_DEGREE == 6
2403   3.11578814719469302614,
2404   -3.32419399085241980044,
2405   2.59883907202499966007,
2406   -1.23152682416275988241,
2407   0.318212422185251071475,
2408   -0.0344359067839062357313
2409#elif LOG_POLY_DEGREE == 5
2410   2.8882704548164776201,
2411   -2.52074962577807006663,
2412   1.48116647521213171641,
2413   -0.465725644288844778798,
2414   0.0596515482674574969533
2415#elif LOG_POLY_DEGREE == 4
2416   2.61761038894603480148,
2417   -1.75647175389045657003,
2418   0.688243882994381274313,
2419   -0.107254423828329604454
2420#elif LOG_POLY_DEGREE == 3
2421   2.28330284476918490682,
2422   -1.04913055217340124191,
2423   0.204446009836232697516
2424#else
2425#error
2426#endif
2427};
2428
2429
2430/**
2431 * See http://www.devmaster.net/forums/showthread.php?p=43580
2432 */
2433void
2434lp_build_log2_approx(struct lp_build_context *bld,
2435                     LLVMValueRef x,
2436                     LLVMValueRef *p_exp,
2437                     LLVMValueRef *p_floor_log2,
2438                     LLVMValueRef *p_log2)
2439{
2440   LLVMBuilderRef builder = bld->gallivm->builder;
2441   const struct lp_type type = bld->type;
2442   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2443   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2444
2445   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2446   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2447   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2448
2449   LLVMValueRef i = NULL;
2450   LLVMValueRef exp = NULL;
2451   LLVMValueRef mant = NULL;
2452   LLVMValueRef logexp = NULL;
2453   LLVMValueRef logmant = NULL;
2454   LLVMValueRef res = NULL;
2455
2456   assert(lp_check_value(bld->type, x));
2457
2458   if(p_exp || p_floor_log2 || p_log2) {
2459      /* TODO: optimize the constant case */
2460      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2461          LLVMIsConstant(x)) {
2462         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2463                      __FUNCTION__);
2464      }
2465
2466      assert(type.floating && type.width == 32);
2467
2468      i = LLVMBuildBitCast(builder, x, int_vec_type, "");
2469
2470      /* exp = (float) exponent(x) */
2471      exp = LLVMBuildAnd(builder, i, expmask, "");
2472   }
2473
2474   if(p_floor_log2 || p_log2) {
2475      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
2476      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
2477      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
2478   }
2479
2480   if(p_log2) {
2481      /* mant = (float) mantissa(x) */
2482      mant = LLVMBuildAnd(builder, i, mantmask, "");
2483      mant = LLVMBuildOr(builder, mant, one, "");
2484      mant = LLVMBuildBitCast(builder, mant, vec_type, "");
2485
2486      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2487                                    Elements(lp_build_log2_polynomial));
2488
2489      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
2490      logmant = LLVMBuildFMul(builder, logmant, LLVMBuildFSub(builder, mant, bld->one, ""), "");
2491
2492      res = LLVMBuildFAdd(builder, logmant, logexp, "");
2493   }
2494
2495   if(p_exp) {
2496      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
2497      *p_exp = exp;
2498   }
2499
2500   if(p_floor_log2)
2501      *p_floor_log2 = logexp;
2502
2503   if(p_log2)
2504      *p_log2 = res;
2505}
2506
2507
2508LLVMValueRef
2509lp_build_log2(struct lp_build_context *bld,
2510              LLVMValueRef x)
2511{
2512   LLVMValueRef res;
2513   lp_build_log2_approx(bld, x, NULL, NULL, &res);
2514   return res;
2515}
2516
2517
2518/**
2519 * Faster (and less accurate) log2.
2520 *
2521 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
2522 *
2523 * Piece-wise linear approximation, with exact results when x is a
2524 * power of two.
2525 *
2526 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2527 */
2528LLVMValueRef
2529lp_build_fast_log2(struct lp_build_context *bld,
2530                   LLVMValueRef x)
2531{
2532   LLVMBuilderRef builder = bld->gallivm->builder;
2533   LLVMValueRef ipart;
2534   LLVMValueRef fpart;
2535
2536   assert(lp_check_value(bld->type, x));
2537
2538   assert(bld->type.floating);
2539
2540   /* ipart = floor(log2(x)) - 1 */
2541   ipart = lp_build_extract_exponent(bld, x, -1);
2542   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
2543
2544   /* fpart = x / 2**ipart */
2545   fpart = lp_build_extract_mantissa(bld, x);
2546
2547   /* ipart + fpart */
2548   return LLVMBuildFAdd(builder, ipart, fpart, "");
2549}
2550
2551
2552/**
2553 * Fast implementation of iround(log2(x)).
2554 *
2555 * Not an approximation -- it should give accurate results all the time.
2556 */
2557LLVMValueRef
2558lp_build_ilog2(struct lp_build_context *bld,
2559               LLVMValueRef x)
2560{
2561   LLVMBuilderRef builder = bld->gallivm->builder;
2562   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
2563   LLVMValueRef ipart;
2564
2565   assert(bld->type.floating);
2566
2567   assert(lp_check_value(bld->type, x));
2568
2569   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
2570   x = LLVMBuildFMul(builder, x, sqrt2, "");
2571
2572   /* ipart = floor(log2(x) + 0.5)  */
2573   ipart = lp_build_extract_exponent(bld, x, 0);
2574
2575   return ipart;
2576}
2577