lp_bld_arit.c revision 533ec3f667d36ba2aea564ff047a8f55be13f6e9
1/**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29/**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 *   of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48#include "util/u_memory.h"
49#include "util/u_debug.h"
50#include "util/u_math.h"
51#include "util/u_string.h"
52#include "util/u_cpu_detect.h"
53
54#include "lp_bld_type.h"
55#include "lp_bld_const.h"
56#include "lp_bld_intr.h"
57#include "lp_bld_init.h" /* for lp_build_engine */
58#include "lp_bld_logic.h"
59#include "lp_bld_pack.h"
60#include "lp_bld_debug.h"
61#include "lp_bld_arit.h"
62
63
64/**
65 * Generate min(a, b)
66 * No checks for special case values of a or b = 1 or 0 are done.
67 */
68static LLVMValueRef
69lp_build_min_simple(struct lp_build_context *bld,
70                    LLVMValueRef a,
71                    LLVMValueRef b)
72{
73   const struct lp_type type = bld->type;
74   const char *intrinsic = NULL;
75   LLVMValueRef cond;
76
77   /* TODO: optimize the constant case */
78
79   if(type.width * type.length == 128) {
80      if(type.floating) {
81         if(type.width == 32 && util_cpu_caps.has_sse)
82            intrinsic = "llvm.x86.sse.min.ps";
83         if(type.width == 64 && util_cpu_caps.has_sse2)
84            intrinsic = "llvm.x86.sse2.min.pd";
85      }
86      else {
87         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
88            intrinsic = "llvm.x86.sse2.pminu.b";
89         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
90            intrinsic = "llvm.x86.sse41.pminsb";
91         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
92            intrinsic = "llvm.x86.sse41.pminuw";
93         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
94            intrinsic = "llvm.x86.sse2.pmins.w";
95         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
96            intrinsic = "llvm.x86.sse41.pminud";
97         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
98            intrinsic = "llvm.x86.sse41.pminsd";
99      }
100   }
101
102   if(intrinsic)
103      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
104
105   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
106   return lp_build_select(bld, cond, a, b);
107}
108
109
110/**
111 * Generate max(a, b)
112 * No checks for special case values of a or b = 1 or 0 are done.
113 */
114static LLVMValueRef
115lp_build_max_simple(struct lp_build_context *bld,
116                    LLVMValueRef a,
117                    LLVMValueRef b)
118{
119   const struct lp_type type = bld->type;
120   const char *intrinsic = NULL;
121   LLVMValueRef cond;
122
123   /* TODO: optimize the constant case */
124
125   if(type.width * type.length == 128) {
126      if(type.floating) {
127         if(type.width == 32 && util_cpu_caps.has_sse)
128            intrinsic = "llvm.x86.sse.max.ps";
129         if(type.width == 64 && util_cpu_caps.has_sse2)
130            intrinsic = "llvm.x86.sse2.max.pd";
131      }
132      else {
133         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
134            intrinsic = "llvm.x86.sse2.pmaxu.b";
135         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
136            intrinsic = "llvm.x86.sse41.pmaxsb";
137         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
138            intrinsic = "llvm.x86.sse41.pmaxuw";
139         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
140            intrinsic = "llvm.x86.sse2.pmaxs.w";
141         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
142            intrinsic = "llvm.x86.sse41.pmaxud";
143         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
144            intrinsic = "llvm.x86.sse41.pmaxsd";
145      }
146   }
147
148   if(intrinsic)
149      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
150
151   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
152   return lp_build_select(bld, cond, a, b);
153}
154
155
/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   /* Fold the trivial cases against the context's known constants. */
   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   /* For unsigned normalized integers 1.0 is the all-ones bit pattern,
    * so the complement 1 - a is exactly the bitwise NOT. */
   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(bld->builder, a, "");
   }

   /* General case: emit 1 - a, constant-folded when a is a constant. */
   if(LLVMIsConstant(a))
      return LLVMConstSub(bld->one, a);
   else
      return LLVMBuildSub(bld->builder, bld->one, a, "");
}
182
183
/**
 * Generate a + b
 *
 * For normalized types, uses saturating SSE2 adds where available, or
 * clamps the result to 1.0 for floating/fixed-point representations.
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   /* Identity / undef short-circuits. */
   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* Normalized values saturate at 1, so 1 + x == 1. */
      if(a == bld->one || b == bld->one)
        return bld->one;

      /* Use the saturating packed-add instructions for 8/16-bit
       * normalized integers in a 128-bit vector. */
      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstAdd(a, b);
   else
      res = LLVMBuildAdd(bld->builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
234
235
/** Return the sum of the elements of a.
 *
 * Extracts each element in turn and adds it into a scalar accumulator;
 * the result is a scalar value of the element type.
 * NOTE(review): uses LLVMBuildAdd for all element types — assumes this
 * LLVM version accepts floats there too; verify for floating types.
 */
LLVMValueRef
lp_build_sum_vector(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i;

   /* Trivial short-circuits. */
   if (a == bld->zero)
      return bld->zero;
   if (a == bld->undef)
      return bld->undef;
   assert(type.length > 1);

   /* No saturation handling here, so normalized types are not supported. */
   assert(!bld->type.norm);

   /* Start with element 0, then accumulate elements 1..length-1. */
   index = LLVMConstInt(LLVMInt32Type(), 0, 0);
   res = LLVMBuildExtractElement(bld->builder, a, index, "");

   for (i = 1; i < type.length; i++) {
      index = LLVMConstInt(LLVMInt32Type(), i, 0);
      res = LLVMBuildAdd(bld->builder, res,
                         LLVMBuildExtractElement(bld->builder, a, index, ""),
                         "");
   }

   return res;
}
265
266
/**
 * Generate a - b
 *
 * For normalized types, uses saturating SSE2 subtracts where available, or
 * clamps the result to 0 for floating/fixed-point representations.
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   /* Identity / undef / self-cancellation short-circuits. */
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* For normalized values a <= 1, so a - 1 saturates to 0. */
      if(b == bld->one)
        return bld->zero;

      /* Use the saturating packed-subtract instructions for 8/16-bit
       * normalized integers in a 128-bit vector. */
      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstSub(a, b);
   else
      res = LLVMBuildSub(bld->builder, a, b, "");

   /* Clamp to floor of 0 for normalized floating/fixed types. */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}
314
315
/**
 * Normalized 8bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL criteria
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 *     must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_u8n(LLVMBuilderRef builder,
                 struct lp_type i16_type,
                 LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef c8;
   LLVMValueRef ab;

   c8 = lp_build_const_int_vec(i16_type, 8);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 8 */
   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");

#endif

   /* Final shift down to the 8-bit range. */
   ab = LLVMBuildLShr(builder, ab, c8, "");

   return ab;
}
386
387
/**
 * Generate a * b
 *
 * Handles normalized 8-bit vectors via lp_build_mul_u8n, and fixed-point
 * types by shifting the double-width product back down.
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   /* Identity / absorbing-element / undef short-circuits. */
   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         /* Widen to 16 bits, multiply each half with rounding, and pack
          * back down to 8 bits. */
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);

         ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME: other normalized integer widths are not implemented. */
      assert(0);
   }

   /* Fixed-point: the product carries twice the fractional bits, so it
    * must be shifted right by width/2 afterwards. */
   if(type.fixed)
      shift = lp_build_const_int_vec(type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      res =  LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      res = LLVMBuildMul(bld->builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(bld->builder, res, shift, "");
         else
            res = LLVMBuildLShr(bld->builder, res, shift, "");
      }
   }

   return res;
}
458
459
/**
 * Small vector x scale multiplication optimization.
 *
 * Multiplies 'a' by the integer constant 'b', using cheaper operations
 * (negation, addition, shifts) when the constant allows it.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMValueRef factor;

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return LLVMBuildNeg(bld->builder, a, "");

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_pot(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the mantissa.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(bld->builder, a, factor, "");
         a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
         return a;
#endif
         /* NOTE: with the exponent trick disabled this branch is empty,
          * so floating-point powers of two fall through to the generic
          * multiply below. */
      }
      else {
         /* Integer power of two: multiply via left shift. */
         factor = lp_build_const_vec(bld->type, shift);
         return LLVMBuildShl(bld->builder, a, factor, "");
      }
   }

   /* Generic case: materialize the constant and multiply. */
   factor = lp_build_const_vec(bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
511
512
/**
 * Generate a / b
 *
 * Assumes a floating-point type (uses FDiv / rcp).
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;

   /* Trivial short-circuits. */
   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      return LLVMConstFDiv(a, b);

   /* Fast path: a * rcp(b) for 4x32 float on SSE.
    * NOTE(review): this trades precision for speed if lp_build_rcp maps to
    * the approximate RCPPS instruction — confirm acceptable for callers. */
   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   return LLVMBuildFDiv(bld->builder, a, b, "");
}
542
543
/**
 * Linear interpolation: v0 + x * (v1 - v0).
 *
 * This also works for integer values with a few caveats.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMValueRef delta;
   LLVMValueRef res;

   /* res = v0 + x * (v1 - v0) */
   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if(bld->type.fixed)
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for other uses. Basically we need a more
       * powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");

   return res;
}
575
576
577LLVMValueRef
578lp_build_lerp_2d(struct lp_build_context *bld,
579                 LLVMValueRef x,
580                 LLVMValueRef y,
581                 LLVMValueRef v00,
582                 LLVMValueRef v01,
583                 LLVMValueRef v10,
584                 LLVMValueRef v11)
585{
586   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
587   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
588   return lp_build_lerp(bld, y, v0, v1);
589}
590
591
592/**
593 * Generate min(a, b)
594 * Do checks for special cases.
595 */
596LLVMValueRef
597lp_build_min(struct lp_build_context *bld,
598             LLVMValueRef a,
599             LLVMValueRef b)
600{
601   if(a == bld->undef || b == bld->undef)
602      return bld->undef;
603
604   if(a == b)
605      return a;
606
607   if(bld->type.norm) {
608      if(a == bld->zero || b == bld->zero)
609         return bld->zero;
610      if(a == bld->one)
611         return b;
612      if(b == bld->one)
613         return a;
614   }
615
616   return lp_build_min_simple(bld, a, b);
617}
618
619
620/**
621 * Generate max(a, b)
622 * Do checks for special cases.
623 */
624LLVMValueRef
625lp_build_max(struct lp_build_context *bld,
626             LLVMValueRef a,
627             LLVMValueRef b)
628{
629   if(a == bld->undef || b == bld->undef)
630      return bld->undef;
631
632   if(a == b)
633      return a;
634
635   if(bld->type.norm) {
636      if(a == bld->one || b == bld->one)
637         return bld->one;
638      if(a == bld->zero)
639         return b;
640      if(b == bld->zero)
641         return a;
642   }
643
644   return lp_build_max_simple(bld, a, b);
645}
646
647
648/**
649 * Generate clamp(a, min, max)
650 * Do checks for special cases.
651 */
652LLVMValueRef
653lp_build_clamp(struct lp_build_context *bld,
654               LLVMValueRef a,
655               LLVMValueRef min,
656               LLVMValueRef max)
657{
658   a = lp_build_min(bld, a, max);
659   a = lp_build_max(bld, a, min);
660   return a;
661}
662
663
/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);

   /* Unsigned values are already non-negative. */
   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      a = LLVMBuildAnd(bld->builder, a, mask, "");
      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
      return a;
   }

   /* Use the SSSE3 packed-absolute instructions when the vector fits in
    * one 128-bit register. */
   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }

   /* Generic fallback: max(a, -a). */
   return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
}
701
702
/** Generate -a. */
LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   return LLVMBuildNeg(bld->builder, a, "");
}
709
710
/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(type);
      vec_type = lp_build_vec_type(type);
      mask = lp_build_const_int_vec(type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      /* i.e. copy a's sign bit onto the bit pattern of 1.0, giving
       * +1.0 or -1.0 without branching. */
      sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(bld->builder, sign, one, "");
      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
   }
   else
   {
      /* Signed integer: compare against zero and select +1 / -1. */
      LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
757
758
/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a);
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   /* Shift amount to move bit 0 of 'sign' into the sign-bit position. */
   LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
   /* Mask with all bits set except the sign bit. */
   LLVMValueRef mask = lp_build_const_int_vec(type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(bld->builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(bld->builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(bld->builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(bld->builder, res, vec_type, "");

   return res;
}
792
793
794/**
795 * Convert vector of (or scalar) int to vector of (or scalar) float.
796 */
797LLVMValueRef
798lp_build_int_to_float(struct lp_build_context *bld,
799                      LLVMValueRef a)
800{
801   const struct lp_type type = bld->type;
802   LLVMTypeRef vec_type = lp_build_vec_type(type);
803
804   assert(type.floating);
805
806   return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
807}
808
809
810
/**
 * Rounding modes for llvm.x86.sse41.round.*; the values match the
 * immediate operand encoding of the SSE4.1 ROUNDPS/ROUNDPD instructions.
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,    /* round to nearest (even) */
   LP_BUILD_ROUND_SSE41_FLOOR = 1,      /* round toward -infinity */
   LP_BUILD_ROUND_SSE41_CEIL = 2,       /* round toward +infinity */
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3    /* round toward zero */
};
818
/**
 * Round 'a' with the SSE4.1 ROUNDPS/ROUNDPD instruction using the given
 * rounding mode. Only valid for 128-bit float vectors on SSE4.1 hardware.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   const char *intrinsic;

   assert(type.floating);
   assert(type.width*type.length == 128);
   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   switch(type.width) {
   case 32:
      intrinsic = "llvm.x86.sse41.round.ps";
      break;
   case 64:
      intrinsic = "llvm.x86.sse41.round.pd";
      break;
   default:
      assert(0);
      return bld->undef;
   }

   /* The rounding mode is passed as an i32 immediate operand. */
   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
}
849
850
851LLVMValueRef
852lp_build_trunc(struct lp_build_context *bld,
853               LLVMValueRef a)
854{
855   const struct lp_type type = bld->type;
856
857   assert(type.floating);
858   assert(lp_check_value(type, a));
859
860   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
861      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
862   else {
863      LLVMTypeRef vec_type = lp_build_vec_type(type);
864      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
865      LLVMValueRef res;
866      res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
867      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
868      return res;
869   }
870}
871
872
873LLVMValueRef
874lp_build_round(struct lp_build_context *bld,
875               LLVMValueRef a)
876{
877   const struct lp_type type = bld->type;
878
879   assert(type.floating);
880   assert(lp_check_value(type, a));
881
882   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
883      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
884   else {
885      LLVMTypeRef vec_type = lp_build_vec_type(type);
886      LLVMValueRef res;
887      res = lp_build_iround(bld, a);
888      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
889      return res;
890   }
891}
892
893
894LLVMValueRef
895lp_build_floor(struct lp_build_context *bld,
896               LLVMValueRef a)
897{
898   const struct lp_type type = bld->type;
899
900   assert(type.floating);
901   assert(lp_check_value(type, a));
902
903   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
904      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
905   else {
906      LLVMTypeRef vec_type = lp_build_vec_type(type);
907      LLVMValueRef res;
908      res = lp_build_ifloor(bld, a);
909      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
910      return res;
911   }
912}
913
914
915LLVMValueRef
916lp_build_ceil(struct lp_build_context *bld,
917              LLVMValueRef a)
918{
919   const struct lp_type type = bld->type;
920
921   assert(type.floating);
922   assert(lp_check_value(type, a));
923
924   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
925      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
926   else {
927      LLVMTypeRef vec_type = lp_build_vec_type(type);
928      LLVMValueRef res;
929      res = lp_build_iceil(bld, a);
930      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
931      return res;
932   }
933}
934
935
936/**
937 * Return fractional part of 'a' computed as a - floor(f)
938 * Typically used in texture coord arithmetic.
939 */
940LLVMValueRef
941lp_build_fract(struct lp_build_context *bld,
942               LLVMValueRef a)
943{
944   assert(bld->type.floating);
945   return lp_build_sub(bld, a, lp_build_floor(bld, a));
946}
947
948
949/**
950 * Convert to integer, through whichever rounding method that's fastest,
951 * typically truncating toward zero.
952 */
953LLVMValueRef
954lp_build_itrunc(struct lp_build_context *bld,
955                LLVMValueRef a)
956{
957   const struct lp_type type = bld->type;
958   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
959
960   assert(type.floating);
961   assert(lp_check_value(type, a));
962
963   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
964}
965
966
/**
 * Convert float[] to int[] with round().
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      /* Emulate rounding by adding +/-0.5 (matching a's sign) and then
       * truncating with the FPToSI below. */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
      LLVMValueRef sign;
      LLVMValueRef half;

      /* get sign bit */
      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");

      /* sign * 0.5 */
      half = lp_build_const_vec(type, 0.5);
      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
      half = LLVMBuildOr(bld->builder, sign, half, "");
      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");

      res = LLVMBuildAdd(bld->builder, a, half, "");
   }

   /* Truncate to integer (the offset above turns truncation into rounding
    * on the non-SSE4.1 path). */
   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");

   return res;
}
1008
1009
/**
 * Convert float[] to int[] with floor().
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      /* Emulate floor by subtracting just-under-one from negative values,
       * so that the truncation below rounds them toward -infinity. */
      /* Take the sign bit and add it to 1 constant */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
      LLVMValueRef sign;
      LLVMValueRef offset;

      /* sign = a < 0 ? ~0 : 0 */
      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
      lp_build_name(sign, "floor.sign");

      /* offset = -0.99999(9)f, i.e. the largest value strictly below 1
       * representable with 'mantissa' fraction bits */
      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
      offset = LLVMConstBitCast(offset, int_vec_type);

      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
      lp_build_name(offset, "floor.offset");

      res = LLVMBuildAdd(bld->builder, a, offset, "");
      lp_build_name(res, "floor.res");
   }

   /* Truncate to integer (the offset above turns truncation into floor
    * on the non-SSE4.1 path). */
   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
   lp_build_name(res, "floor");

   return res;
}
1059
1060
1061LLVMValueRef
1062lp_build_iceil(struct lp_build_context *bld,
1063               LLVMValueRef a)
1064{
1065   const struct lp_type type = bld->type;
1066   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1067   LLVMValueRef res;
1068
1069   assert(type.floating);
1070   assert(lp_check_value(type, a));
1071
1072   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1073      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1074   }
1075   else {
1076      /* TODO: mimic lp_build_ifloor() here */
1077      assert(0);
1078      res = bld->undef;
1079   }
1080
1081   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1082
1083   return res;
1084}
1085
1086
/**
 * Generate sqrt(a) via the generic llvm.sqrt.* intrinsic for this
 * vector type (e.g. "llvm.sqrt.v4f32").
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   char intrinsic[32];

   /* TODO: optimize the constant case */

   assert(type.floating);
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
}
1103
1104
1105LLVMValueRef
1106lp_build_rcp(struct lp_build_context *bld,
1107             LLVMValueRef a)
1108{
1109   const struct lp_type type = bld->type;
1110
1111   if(a == bld->zero)
1112      return bld->undef;
1113   if(a == bld->one)
1114      return bld->one;
1115   if(a == bld->undef)
1116      return bld->undef;
1117
1118   assert(type.floating);
1119
1120   if(LLVMIsConstant(a))
1121      return LLVMConstFDiv(bld->one, a);
1122
1123   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1124      /*
1125       * XXX: Added precision is not always necessary, so only enable this
1126       * when we have a better system in place to track minimum precision.
1127       */
1128
1129#if 0
1130      /*
1131       * Do one Newton-Raphson step to improve precision:
1132       *
1133       *   x1 = (2 - a * rcp(a)) * rcp(a)
1134       */
1135
1136      LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1137      LLVMValueRef rcp_a;
1138      LLVMValueRef res;
1139
1140      rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1141
1142      res = LLVMBuildMul(bld->builder, a, rcp_a, "");
1143      res = LLVMBuildSub(bld->builder, two, res, "");
1144      res = LLVMBuildMul(bld->builder, res, rcp_a, "");
1145
1146      return rcp_a;
1147#else
1148      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1149#endif
1150   }
1151
1152   return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1153}
1154
1155
1156/**
1157 * Generate 1/sqrt(a)
1158 */
1159LLVMValueRef
1160lp_build_rsqrt(struct lp_build_context *bld,
1161               LLVMValueRef a)
1162{
1163   const struct lp_type type = bld->type;
1164
1165   assert(type.floating);
1166
1167   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1168      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1169
1170   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1171}
1172
1173
1174#ifdef PIPE_OS_WINDOWS
1175
1176/*
1177 * XXX: X86 backend translates llvm.cos.v4f32 to 4 calls to CRT's cosf()
1178 * which is neither efficient nor does the CRT linkage work on Windows
1179 * causing segmentation fault.
1180 *
1181 * XXX: With LLVM 2.7 both schemes cause an assertion failure.
1182 */
/**
 * Evaluate a scalar libm-style function element-wise over a float vector.
 *
 * Declares (once per module) an external single-float function named
 * `name`, maps it to the host C function `func` via the JIT engine, and
 * emits one extract/call/insert triple per vector element.
 *
 * NOTE(review): the function is registered with LLVMPrivateLinkage while
 * its body is supplied through LLVMAddGlobalMapping — presumably external
 * linkage is what the JIT expects for mapped symbols; verify against the
 * LLVM execution-engine documentation.
 */
static LLVMValueRef
lp_build_sincos(struct lp_build_context *bld,
                const char *name,
                float (*func)(float),
                LLVMValueRef a)
{
   /* Module that owns the current insertion point's function. */
   LLVMModuleRef module =
         LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld->builder)));
   LLVMValueRef function;
   LLVMValueRef res;
   unsigned i;

   assert(bld->type.floating);
   assert(bld->type.width == 32);

   /* Declare the scalar float -> float helper only on first use. */
   function = LLVMGetNamedFunction(module, name);
   if (!function) {
      LLVMTypeRef ret_type;
      LLVMTypeRef arg_types[1];
      LLVMTypeRef function_type;

      ret_type = LLVMFloatType();
      arg_types[0] = LLVMFloatType();
      function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
      function = LLVMAddFunction(module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMPrivateLinkage);

      assert(LLVMIsDeclaration(function));

      /* Bind the declaration to the host-side C implementation. */
      LLVMAddGlobalMapping(lp_build_engine, function, func);
   }

   /* Build the result one lane at a time: extract, call, insert. */
   res = bld->undef;

   for (i = 0; i < bld->type.length; ++i) {
      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
      LLVMValueRef args[1];
      LLVMValueRef tmp;

      args[0] = LLVMBuildExtractElement(bld->builder, a, index, "");

      tmp = LLVMBuildCall(bld->builder, function, args, Elements(args), "");

      res = LLVMBuildInsertElement(bld->builder, res, tmp, index, "");
   }

   return res;
}
1233
1234static float c_cosf( float f )
1235{
1236   return (float) cos( (double) f );
1237}
1238
1239static float c_sinf( float f )
1240{
1241   return (float) sin( (double) f );
1242}
1243
/**
 * Generate cos(a), element-wise through the CRT cosf wrapper
 * (see lp_build_sincos()).
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sincos(bld, "cosf", &c_cosf, a);
}
1250
/**
 * Generate sin(a), element-wise through the CRT sinf wrapper
 * (see lp_build_sincos()).
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sincos(bld, "sinf", &c_sinf, a);
}
1257
1258#else /* !PIPE_OS_WINDOWS */
1259
1260/**
1261 * Generate cos(a)
1262 */
1263LLVMValueRef
1264lp_build_cos(struct lp_build_context *bld,
1265              LLVMValueRef a)
1266{
1267   const struct lp_type type = bld->type;
1268   LLVMTypeRef vec_type = lp_build_vec_type(type);
1269   char intrinsic[32];
1270
1271   /* TODO: optimize the constant case */
1272
1273   assert(type.floating);
1274   util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
1275
1276   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1277}
1278
1279
1280/**
1281 * Generate sin(a)
1282 */
1283LLVMValueRef
1284lp_build_sin(struct lp_build_context *bld,
1285              LLVMValueRef a)
1286{
1287   const struct lp_type type = bld->type;
1288   LLVMTypeRef vec_type = lp_build_vec_type(type);
1289   char intrinsic[32];
1290
1291   /* TODO: optimize the constant case */
1292
1293   assert(type.floating);
1294   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
1295
1296   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1297}
1298
1299#endif /* !PIPE_OS_WINDOWS */
1300
1301
1302/**
1303 * Generate pow(x, y)
1304 */
1305LLVMValueRef
1306lp_build_pow(struct lp_build_context *bld,
1307             LLVMValueRef x,
1308             LLVMValueRef y)
1309{
1310   /* TODO: optimize the constant case */
1311   if(LLVMIsConstant(x) && LLVMIsConstant(y))
1312      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1313                   __FUNCTION__);
1314
1315   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1316}
1317
1318
1319/**
1320 * Generate exp(x)
1321 */
1322LLVMValueRef
1323lp_build_exp(struct lp_build_context *bld,
1324             LLVMValueRef x)
1325{
1326   /* log2(e) = 1/log(2) */
1327   LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1328
1329   return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
1330}
1331
1332
1333/**
1334 * Generate log(x)
1335 */
1336LLVMValueRef
1337lp_build_log(struct lp_build_context *bld,
1338             LLVMValueRef x)
1339{
1340   /* log(2) */
1341   LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1342
1343   return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
1344}
1345
1346
1347#define EXP_POLY_DEGREE 3
1348#define LOG_POLY_DEGREE 5
1349
1350
1351/**
1352 * Generate polynomial.
1353 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1354 */
1355static LLVMValueRef
1356lp_build_polynomial(struct lp_build_context *bld,
1357                    LLVMValueRef x,
1358                    const double *coeffs,
1359                    unsigned num_coeffs)
1360{
1361   const struct lp_type type = bld->type;
1362   LLVMValueRef res = NULL;
1363   unsigned i;
1364
1365   /* TODO: optimize the constant case */
1366   if(LLVMIsConstant(x))
1367      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1368                   __FUNCTION__);
1369
1370   for (i = num_coeffs; i--; ) {
1371      LLVMValueRef coeff;
1372
1373      coeff = lp_build_const_vec(type, coeffs[i]);
1374
1375      if(res)
1376         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1377      else
1378         res = coeff;
1379   }
1380
1381   if(res)
1382      return res;
1383   else
1384      return bld->undef;
1385}
1386
1387
1388/**
1389 * Minimax polynomial fit of 2**x, in range [0, 1[
1390 */
1391const double lp_build_exp2_polynomial[] = {
1392#if EXP_POLY_DEGREE == 5
1393   0.999999999690134838155,
1394   0.583974334321735217258,
1395   0.164553105719676828492,
1396   0.0292811063701710962255,
1397   0.00354944426657875141846,
1398   0.000296253726543423377365
1399#elif EXP_POLY_DEGREE == 4
1400   1.00000001502262084505,
1401   0.563586057338685991394,
1402   0.150436017652442413623,
1403   0.0243220604213317927308,
1404   0.0025359088446580436489
1405#elif EXP_POLY_DEGREE == 3
1406   0.999925218562710312959,
1407   0.695833540494823811697,
1408   0.226067155427249155588,
1409   0.0780245226406372992967
1410#elif EXP_POLY_DEGREE == 2
1411   1.00172476321474503578,
1412   0.657636275736077639316,
1413   0.33718943461968720704
1414#else
1415#error
1416#endif
1417};
1418
1419
/**
 * Approximate 2^x for a float32 vector.
 *
 * Splits x into integer part (ipart = floor(x)) and fractional part
 * (fpart in [0,1[), builds 2^ipart directly in the IEEE-754 exponent
 * bits, and evaluates 2^fpart with a minimax polynomial.  Any of the
 * output pointers may be NULL; only the requested values are computed.
 *
 * @param p_exp2_int_part  optional: receives 2^ipart as a float vector
 * @param p_frac_part      optional: receives fpart
 * @param p_exp2           optional: receives the full 2^x approximation
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      /* Exponent-bit construction below hard-codes the float32 layout
       * (bias 127, 23 mantissa bits). */
      assert(type.floating && type.width == 32);

      /* Clamp x to the range where the exponent construction is valid. */
      x = lp_build_min(bld, x, lp_build_const_vec(type,  129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));

      /* ipart = floor(x) */
      ipart = lp_build_floor(bld, x);

      /* fpart = x - ipart */
      fpart = LLVMBuildSub(bld->builder, x, ipart, "");
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) -- place (ipart + 127) into the
       * float32 exponent field to synthesize 2^ipart directly */
      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* 2^fpart via minimax polynomial, then 2^x = 2^ipart * 2^fpart */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
1478
1479
/**
 * Generate 2^x (convenience wrapper around lp_build_exp2_approx()).
 */
LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
   return res;
}
1488
1489
1490/**
1491 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 * These coefficients can be generated with
1493 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1494 */
/* Coefficients are ordered from the constant term upwards, as consumed by
 * lp_build_polynomial().  The active set is selected by LOG_POLY_DEGREE. */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614,
   -3.32419399085241980044,
   2.59883907202499966007,
   -1.23152682416275988241,
   0.318212422185251071475,
   -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201,
   -2.52074962577807006663,
   1.48116647521213171641,
   -0.465725644288844778798,
   0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148,
   -1.75647175389045657003,
   0.688243882994381274313,
   -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682,
   -1.04913055217340124191,
   0.204446009836232697516
#else
#error
#endif
};
1522
1523
1524/**
1525 * See http://www.devmaster.net/forums/showthread.php?p=43580
1526 */
1527void
1528lp_build_log2_approx(struct lp_build_context *bld,
1529                     LLVMValueRef x,
1530                     LLVMValueRef *p_exp,
1531                     LLVMValueRef *p_floor_log2,
1532                     LLVMValueRef *p_log2)
1533{
1534   const struct lp_type type = bld->type;
1535   LLVMTypeRef vec_type = lp_build_vec_type(type);
1536   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1537
1538   LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
1539   LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
1540   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1541
1542   LLVMValueRef i = NULL;
1543   LLVMValueRef exp = NULL;
1544   LLVMValueRef mant = NULL;
1545   LLVMValueRef logexp = NULL;
1546   LLVMValueRef logmant = NULL;
1547   LLVMValueRef res = NULL;
1548
1549   if(p_exp || p_floor_log2 || p_log2) {
1550      /* TODO: optimize the constant case */
1551      if(LLVMIsConstant(x))
1552         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1553                      __FUNCTION__);
1554
1555      assert(type.floating && type.width == 32);
1556
1557      i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1558
1559      /* exp = (float) exponent(x) */
1560      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1561   }
1562
1563   if(p_floor_log2 || p_log2) {
1564      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
1565      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
1566      logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1567   }
1568
1569   if(p_log2) {
1570      /* mant = (float) mantissa(x) */
1571      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1572      mant = LLVMBuildOr(bld->builder, mant, one, "");
1573      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
1574
1575      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1576                                    Elements(lp_build_log2_polynomial));
1577
1578      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1579      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
1580
1581      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1582   }
1583
1584   if(p_exp) {
1585      exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
1586      *p_exp = exp;
1587   }
1588
1589   if(p_floor_log2)
1590      *p_floor_log2 = logexp;
1591
1592   if(p_log2)
1593      *p_log2 = res;
1594}
1595
1596
/**
 * Generate log2(x) (convenience wrapper around lp_build_log2_approx()).
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res);
   return res;
}
1605