lp_bld_arit.c revision 8a3a971743a90463e65b44f1769a5301a31ce4cd
1/**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29/**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 *   of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48#include "util/u_memory.h"
49#include "util/u_debug.h"
50#include "util/u_math.h"
51#include "util/u_string.h"
52#include "util/u_cpu_detect.h"
53
54#include "lp_bld_type.h"
55#include "lp_bld_const.h"
56#include "lp_bld_intr.h"
57#include "lp_bld_logic.h"
58#include "lp_bld_pack.h"
59#include "lp_bld_arit.h"
60
61
62/**
63 * Generate min(a, b)
64 * No checks for special case values of a or b = 1 or 0 are done.
65 */
66static LLVMValueRef
67lp_build_min_simple(struct lp_build_context *bld,
68                    LLVMValueRef a,
69                    LLVMValueRef b)
70{
71   const struct lp_type type = bld->type;
72   const char *intrinsic = NULL;
73   LLVMValueRef cond;
74
75   /* TODO: optimize the constant case */
76
77   if(type.width * type.length == 128) {
78      if(type.floating) {
79         if(type.width == 32 && util_cpu_caps.has_sse)
80            intrinsic = "llvm.x86.sse.min.ps";
81         if(type.width == 64 && util_cpu_caps.has_sse2)
82            intrinsic = "llvm.x86.sse2.min.pd";
83      }
84      else {
85         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
86            intrinsic = "llvm.x86.sse2.pminu.b";
87         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
88            intrinsic = "llvm.x86.sse41.pminsb";
89         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
90            intrinsic = "llvm.x86.sse41.pminuw";
91         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
92            intrinsic = "llvm.x86.sse2.pmins.w";
93         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
94            intrinsic = "llvm.x86.sse41.pminud";
95         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
96            intrinsic = "llvm.x86.sse41.pminsd";
97      }
98   }
99
100   if(intrinsic)
101      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
102
103   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
104   return lp_build_select(bld, cond, a, b);
105}
106
107
108/**
109 * Generate max(a, b)
110 * No checks for special case values of a or b = 1 or 0 are done.
111 */
112static LLVMValueRef
113lp_build_max_simple(struct lp_build_context *bld,
114                    LLVMValueRef a,
115                    LLVMValueRef b)
116{
117   const struct lp_type type = bld->type;
118   const char *intrinsic = NULL;
119   LLVMValueRef cond;
120
121   /* TODO: optimize the constant case */
122
123   if(type.width * type.length == 128) {
124      if(type.floating) {
125         if(type.width == 32 && util_cpu_caps.has_sse)
126            intrinsic = "llvm.x86.sse.max.ps";
127         if(type.width == 64 && util_cpu_caps.has_sse2)
128            intrinsic = "llvm.x86.sse2.max.pd";
129      }
130      else {
131         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
132            intrinsic = "llvm.x86.sse2.pmaxu.b";
133         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
134            intrinsic = "llvm.x86.sse41.pmaxsb";
135         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
136            intrinsic = "llvm.x86.sse41.pmaxuw";
137         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
138            intrinsic = "llvm.x86.sse2.pmaxs.w";
139         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
140            intrinsic = "llvm.x86.sse41.pmaxud";
141         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
142            intrinsic = "llvm.x86.sse41.pmaxsd";
143      }
144   }
145
146   if(intrinsic)
147      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
148
149   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
150   return lp_build_select(bld, cond, a, b);
151}
152
153
154/**
155 * Generate 1 - a, or ~a depending on bld->type.
156 */
157LLVMValueRef
158lp_build_comp(struct lp_build_context *bld,
159              LLVMValueRef a)
160{
161   const struct lp_type type = bld->type;
162
163   if(a == bld->one)
164      return bld->zero;
165   if(a == bld->zero)
166      return bld->one;
167
168   if(type.norm && !type.floating && !type.fixed && !type.sign) {
169      if(LLVMIsConstant(a))
170         return LLVMConstNot(a);
171      else
172         return LLVMBuildNot(bld->builder, a, "");
173   }
174
175   if(LLVMIsConstant(a))
176      if (type.floating)
177          return LLVMConstFSub(bld->one, a);
178      else
179          return LLVMConstSub(bld->one, a);
180   else
181      if (type.floating)
182         return LLVMBuildFSub(bld->builder, bld->one, a, "");
183      else
184         return LLVMBuildSub(bld->builder, bld->one, a, "");
185}
186
187
188/**
189 * Generate a + b
190 */
191LLVMValueRef
192lp_build_add(struct lp_build_context *bld,
193             LLVMValueRef a,
194             LLVMValueRef b)
195{
196   const struct lp_type type = bld->type;
197   LLVMValueRef res;
198
199   assert(lp_check_value(type, a));
200   assert(lp_check_value(type, b));
201
202   if(a == bld->zero)
203      return b;
204   if(b == bld->zero)
205      return a;
206   if(a == bld->undef || b == bld->undef)
207      return bld->undef;
208
209   if(bld->type.norm) {
210      const char *intrinsic = NULL;
211
212      if(a == bld->one || b == bld->one)
213        return bld->one;
214
215      if(util_cpu_caps.has_sse2 &&
216         type.width * type.length == 128 &&
217         !type.floating && !type.fixed) {
218         if(type.width == 8)
219            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
220         if(type.width == 16)
221            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
222      }
223
224      if(intrinsic)
225         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
226   }
227
228   if(LLVMIsConstant(a) && LLVMIsConstant(b))
229      if (type.floating)
230         res = LLVMConstFAdd(a, b);
231      else
232         res = LLVMConstAdd(a, b);
233   else
234      if (type.floating)
235         res = LLVMBuildFAdd(bld->builder, a, b, "");
236      else
237         res = LLVMBuildAdd(bld->builder, a, b, "");
238
239   /* clamp to ceiling of 1.0 */
240   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
241      res = lp_build_min_simple(bld, res, bld->one);
242
243   /* XXX clamp to floor of -1 or 0??? */
244
245   return res;
246}
247
248
249/** Return the sum of the elements of a */
250LLVMValueRef
251lp_build_sum_vector(struct lp_build_context *bld,
252                    LLVMValueRef a)
253{
254   const struct lp_type type = bld->type;
255   LLVMValueRef index, res;
256   unsigned i;
257
258   if (a == bld->zero)
259      return bld->zero;
260   if (a == bld->undef)
261      return bld->undef;
262   assert(type.length > 1);
263
264   assert(!bld->type.norm);
265
266   index = LLVMConstInt(LLVMInt32Type(), 0, 0);
267   res = LLVMBuildExtractElement(bld->builder, a, index, "");
268
269   for (i = 1; i < type.length; i++) {
270      index = LLVMConstInt(LLVMInt32Type(), i, 0);
271      if (type.floating)
272         res = LLVMBuildFAdd(bld->builder, res,
273                            LLVMBuildExtractElement(bld->builder,
274                                                    a, index, ""),
275                            "");
276      else
277         res = LLVMBuildAdd(bld->builder, res,
278                            LLVMBuildExtractElement(bld->builder,
279                                                    a, index, ""),
280                            "");
281   }
282
283   return res;
284}
285
286
287/**
288 * Generate a - b
289 */
290LLVMValueRef
291lp_build_sub(struct lp_build_context *bld,
292             LLVMValueRef a,
293             LLVMValueRef b)
294{
295   const struct lp_type type = bld->type;
296   LLVMValueRef res;
297
298   assert(lp_check_value(type, a));
299   assert(lp_check_value(type, b));
300
301   if(b == bld->zero)
302      return a;
303   if(a == bld->undef || b == bld->undef)
304      return bld->undef;
305   if(a == b)
306      return bld->zero;
307
308   if(bld->type.norm) {
309      const char *intrinsic = NULL;
310
311      if(b == bld->one)
312        return bld->zero;
313
314      if(util_cpu_caps.has_sse2 &&
315         type.width * type.length == 128 &&
316         !type.floating && !type.fixed) {
317         if(type.width == 8)
318            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
319         if(type.width == 16)
320            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
321      }
322
323      if(intrinsic)
324         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
325   }
326
327   if(LLVMIsConstant(a) && LLVMIsConstant(b))
328      if (type.floating)
329         res = LLVMConstFSub(a, b);
330      else
331         res = LLVMConstSub(a, b);
332   else
333      if (type.floating)
334         res = LLVMBuildFSub(bld->builder, a, b, "");
335      else
336         res = LLVMBuildSub(bld->builder, a, b, "");
337
338   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
339      res = lp_build_max_simple(bld, res, bld->zero);
340
341   return res;
342}
343
344
345/**
346 * Normalized 8bit multiplication.
347 *
348 * - alpha plus one
349 *
350 *     makes the following approximation to the division (Sree)
351 *
352 *       a*b/255 ~= (a*(b + 1)) >> 256
353 *
354 *     which is the fastest method that satisfies the following OpenGL criteria
355 *
356 *       0*0 = 0 and 255*255 = 255
357 *
358 * - geometric series
359 *
360 *     takes the geometric series approximation to the division
361 *
362 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
363 *
364 *     in this case just the first two terms to fit in 16bit arithmetic
365 *
366 *       t/255 ~= (t + (t >> 8)) >> 8
367 *
368 *     note that just by itself it doesn't satisfies the OpenGL criteria, as
369 *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
370 *     must be used
371 *
372 * - geometric series plus rounding
373 *
374 *     when using a geometric series division instead of truncating the result
375 *     use roundoff in the approximation (Jim Blinn)
376 *
377 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
378 *
379 *     achieving the exact results
380 *
381 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
382 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
383 * @sa Michael Herf, The "double blend trick", May 2000,
384 *     http://www.stereopsis.com/doubleblend.html
385 */
386static LLVMValueRef
387lp_build_mul_u8n(LLVMBuilderRef builder,
388                 struct lp_type i16_type,
389                 LLVMValueRef a, LLVMValueRef b)
390{
391   LLVMValueRef c8;
392   LLVMValueRef ab;
393
394   c8 = lp_build_const_int_vec(i16_type, 8);
395
396#if 0
397
398   /* a*b/255 ~= (a*(b + 1)) >> 256 */
399   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
400   ab = LLVMBuildMul(builder, a, b, "");
401
402#else
403
404   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
405   ab = LLVMBuildMul(builder, a, b, "");
406   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
407   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
408
409#endif
410
411   ab = LLVMBuildLShr(builder, ab, c8, "");
412
413   return ab;
414}
415
416
417/**
418 * Generate a * b
419 */
420LLVMValueRef
421lp_build_mul(struct lp_build_context *bld,
422             LLVMValueRef a,
423             LLVMValueRef b)
424{
425   const struct lp_type type = bld->type;
426   LLVMValueRef shift;
427   LLVMValueRef res;
428
429   assert(lp_check_value(type, a));
430   assert(lp_check_value(type, b));
431
432   if(a == bld->zero)
433      return bld->zero;
434   if(a == bld->one)
435      return b;
436   if(b == bld->zero)
437      return bld->zero;
438   if(b == bld->one)
439      return a;
440   if(a == bld->undef || b == bld->undef)
441      return bld->undef;
442
443   if(!type.floating && !type.fixed && type.norm) {
444      if(type.width == 8) {
445         struct lp_type i16_type = lp_wider_type(type);
446         LLVMValueRef al, ah, bl, bh, abl, abh, ab;
447
448         lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
449         lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
450
451         /* PMULLW, PSRLW, PADDW */
452         abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
453         abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
454
455         ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
456
457         return ab;
458      }
459
460      /* FIXME */
461      assert(0);
462   }
463
464   if(type.fixed)
465      shift = lp_build_const_int_vec(type, type.width/2);
466   else
467      shift = NULL;
468
469   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
470      if (type.floating)
471         res = LLVMConstFMul(a, b);
472      else
473         res = LLVMConstMul(a, b);
474      if(shift) {
475         if(type.sign)
476            res = LLVMConstAShr(res, shift);
477         else
478            res = LLVMConstLShr(res, shift);
479      }
480   }
481   else {
482      if (type.floating)
483         res = LLVMBuildFMul(bld->builder, a, b, "");
484      else
485         res = LLVMBuildMul(bld->builder, a, b, "");
486      if(shift) {
487         if(type.sign)
488            res = LLVMBuildAShr(bld->builder, res, shift, "");
489         else
490            res = LLVMBuildLShr(bld->builder, res, shift, "");
491      }
492   }
493
494   return res;
495}
496
497
498/**
499 * Small vector x scale multiplication optimization.
500 */
501LLVMValueRef
502lp_build_mul_imm(struct lp_build_context *bld,
503                 LLVMValueRef a,
504                 int b)
505{
506   LLVMValueRef factor;
507
508   if(b == 0)
509      return bld->zero;
510
511   if(b == 1)
512      return a;
513
514   if(b == -1)
515      return lp_build_negate(bld, a);
516
517   if(b == 2 && bld->type.floating)
518      return lp_build_add(bld, a, a);
519
520   if(util_is_pot(b)) {
521      unsigned shift = ffs(b) - 1;
522
523      if(bld->type.floating) {
524#if 0
525         /*
526          * Power of two multiplication by directly manipulating the mantissa.
527          *
528          * XXX: This might not be always faster, it will introduce a small error
529          * for multiplication by zero, and it will produce wrong results
530          * for Inf and NaN.
531          */
532         unsigned mantissa = lp_mantissa(bld->type);
533         factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
534         a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
535         a = LLVMBuildAdd(bld->builder, a, factor, "");
536         a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
537         return a;
538#endif
539      }
540      else {
541         factor = lp_build_const_vec(bld->type, shift);
542         return LLVMBuildShl(bld->builder, a, factor, "");
543      }
544   }
545
546   factor = lp_build_const_vec(bld->type, (double)b);
547   return lp_build_mul(bld, a, factor);
548}
549
550
551/**
552 * Generate a / b
553 */
554LLVMValueRef
555lp_build_div(struct lp_build_context *bld,
556             LLVMValueRef a,
557             LLVMValueRef b)
558{
559   const struct lp_type type = bld->type;
560
561   assert(lp_check_value(type, a));
562   assert(lp_check_value(type, b));
563
564   if(a == bld->zero)
565      return bld->zero;
566   if(a == bld->one)
567      return lp_build_rcp(bld, b);
568   if(b == bld->zero)
569      return bld->undef;
570   if(b == bld->one)
571      return a;
572   if(a == bld->undef || b == bld->undef)
573      return bld->undef;
574
575   if(LLVMIsConstant(a) && LLVMIsConstant(b))
576      return LLVMConstFDiv(a, b);
577
578   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
579      return lp_build_mul(bld, a, lp_build_rcp(bld, b));
580
581   return LLVMBuildFDiv(bld->builder, a, b, "");
582}
583
584
585/**
586 * Linear interpolation.
587 *
588 * This also works for integer values with a few caveats.
589 *
590 * @sa http://www.stereopsis.com/doubleblend.html
591 */
592LLVMValueRef
593lp_build_lerp(struct lp_build_context *bld,
594              LLVMValueRef x,
595              LLVMValueRef v0,
596              LLVMValueRef v1)
597{
598   LLVMValueRef delta;
599   LLVMValueRef res;
600
601   delta = lp_build_sub(bld, v1, v0);
602
603   res = lp_build_mul(bld, x, delta);
604
605   res = lp_build_add(bld, v0, res);
606
607   if(bld->type.fixed)
608      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
609       * but it will be wrong for other uses. Basically we need a more
610       * powerful lp_type, capable of further distinguishing the values
611       * interpretation from the value storage. */
612      res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
613
614   return res;
615}
616
617
618LLVMValueRef
619lp_build_lerp_2d(struct lp_build_context *bld,
620                 LLVMValueRef x,
621                 LLVMValueRef y,
622                 LLVMValueRef v00,
623                 LLVMValueRef v01,
624                 LLVMValueRef v10,
625                 LLVMValueRef v11)
626{
627   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
628   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
629   return lp_build_lerp(bld, y, v0, v1);
630}
631
632
633/**
634 * Generate min(a, b)
635 * Do checks for special cases.
636 */
637LLVMValueRef
638lp_build_min(struct lp_build_context *bld,
639             LLVMValueRef a,
640             LLVMValueRef b)
641{
642   if(a == bld->undef || b == bld->undef)
643      return bld->undef;
644
645   if(a == b)
646      return a;
647
648   if(bld->type.norm) {
649      if(a == bld->zero || b == bld->zero)
650         return bld->zero;
651      if(a == bld->one)
652         return b;
653      if(b == bld->one)
654         return a;
655   }
656
657   return lp_build_min_simple(bld, a, b);
658}
659
660
661/**
662 * Generate max(a, b)
663 * Do checks for special cases.
664 */
665LLVMValueRef
666lp_build_max(struct lp_build_context *bld,
667             LLVMValueRef a,
668             LLVMValueRef b)
669{
670   if(a == bld->undef || b == bld->undef)
671      return bld->undef;
672
673   if(a == b)
674      return a;
675
676   if(bld->type.norm) {
677      if(a == bld->one || b == bld->one)
678         return bld->one;
679      if(a == bld->zero)
680         return b;
681      if(b == bld->zero)
682         return a;
683   }
684
685   return lp_build_max_simple(bld, a, b);
686}
687
688
689/**
690 * Generate clamp(a, min, max)
691 * Do checks for special cases.
692 */
693LLVMValueRef
694lp_build_clamp(struct lp_build_context *bld,
695               LLVMValueRef a,
696               LLVMValueRef min,
697               LLVMValueRef max)
698{
699   a = lp_build_min(bld, a, max);
700   a = lp_build_max(bld, a, min);
701   return a;
702}
703
704
705/**
706 * Generate abs(a)
707 */
708LLVMValueRef
709lp_build_abs(struct lp_build_context *bld,
710             LLVMValueRef a)
711{
712   const struct lp_type type = bld->type;
713   LLVMTypeRef vec_type = lp_build_vec_type(type);
714
715   if(!type.sign)
716      return a;
717
718   if(type.floating) {
719      /* Mask out the sign bit */
720      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
721      unsigned long long absMask = ~(1ULL << (type.width - 1));
722      LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
723      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
724      a = LLVMBuildAnd(bld->builder, a, mask, "");
725      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
726      return a;
727   }
728
729   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
730      switch(type.width) {
731      case 8:
732         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
733      case 16:
734         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
735      case 32:
736         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
737      }
738   }
739
740   return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
741}
742
743
744LLVMValueRef
745lp_build_negate(struct lp_build_context *bld,
746                LLVMValueRef a)
747{
748#if HAVE_LLVM >= 0x0207
749   if (bld->type.floating)
750      a = LLVMBuildFNeg(bld->builder, a, "");
751   else
752#endif
753      a = LLVMBuildNeg(bld->builder, a, "");
754
755   return a;
756}
757
758
759/** Return -1, 0 or +1 depending on the sign of a */
760LLVMValueRef
761lp_build_sgn(struct lp_build_context *bld,
762             LLVMValueRef a)
763{
764   const struct lp_type type = bld->type;
765   LLVMValueRef cond;
766   LLVMValueRef res;
767
768   /* Handle non-zero case */
769   if(!type.sign) {
770      /* if not zero then sign must be positive */
771      res = bld->one;
772   }
773   else if(type.floating) {
774      LLVMTypeRef vec_type;
775      LLVMTypeRef int_type;
776      LLVMValueRef mask;
777      LLVMValueRef sign;
778      LLVMValueRef one;
779      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
780
781      int_type = lp_build_int_vec_type(type);
782      vec_type = lp_build_vec_type(type);
783      mask = lp_build_const_int_vec(type, maskBit);
784
785      /* Take the sign bit and add it to 1 constant */
786      sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
787      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
788      one = LLVMConstBitCast(bld->one, int_type);
789      res = LLVMBuildOr(bld->builder, sign, one, "");
790      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
791   }
792   else
793   {
794      LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
795      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
796      res = lp_build_select(bld, cond, bld->one, minus_one);
797   }
798
799   /* Handle zero */
800   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
801   res = lp_build_select(bld, cond, bld->zero, res);
802
803   return res;
804}
805
806
807/**
808 * Set the sign of float vector 'a' according to 'sign'.
809 * If sign==0, return abs(a).
810 * If sign==1, return -abs(a);
811 * Other values for sign produce undefined results.
812 */
813LLVMValueRef
814lp_build_set_sign(struct lp_build_context *bld,
815                  LLVMValueRef a, LLVMValueRef sign)
816{
817   const struct lp_type type = bld->type;
818   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
819   LLVMTypeRef vec_type = lp_build_vec_type(type);
820   LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
821   LLVMValueRef mask = lp_build_const_int_vec(type,
822                             ~((unsigned long long) 1 << (type.width - 1)));
823   LLVMValueRef val, res;
824
825   assert(type.floating);
826
827   /* val = reinterpret_cast<int>(a) */
828   val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
829   /* val = val & mask */
830   val = LLVMBuildAnd(bld->builder, val, mask, "");
831   /* sign = sign << shift */
832   sign = LLVMBuildShl(bld->builder, sign, shift, "");
833   /* res = val | sign */
834   res = LLVMBuildOr(bld->builder, val, sign, "");
835   /* res = reinterpret_cast<float>(res) */
836   res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
837
838   return res;
839}
840
841
842/**
843 * Convert vector of (or scalar) int to vector of (or scalar) float.
844 */
845LLVMValueRef
846lp_build_int_to_float(struct lp_build_context *bld,
847                      LLVMValueRef a)
848{
849   const struct lp_type type = bld->type;
850   LLVMTypeRef vec_type = lp_build_vec_type(type);
851
852   assert(type.floating);
853
854   return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
855}
856
857
858
859enum lp_build_round_sse41_mode
860{
861   LP_BUILD_ROUND_SSE41_NEAREST = 0,
862   LP_BUILD_ROUND_SSE41_FLOOR = 1,
863   LP_BUILD_ROUND_SSE41_CEIL = 2,
864   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
865};
866
867
868static INLINE LLVMValueRef
869lp_build_round_sse41(struct lp_build_context *bld,
870                     LLVMValueRef a,
871                     enum lp_build_round_sse41_mode mode)
872{
873   const struct lp_type type = bld->type;
874   LLVMTypeRef vec_type = lp_build_vec_type(type);
875   const char *intrinsic;
876
877   assert(type.floating);
878   assert(type.width*type.length == 128);
879   assert(lp_check_value(type, a));
880   assert(util_cpu_caps.has_sse4_1);
881
882   switch(type.width) {
883   case 32:
884      intrinsic = "llvm.x86.sse41.round.ps";
885      break;
886   case 64:
887      intrinsic = "llvm.x86.sse41.round.pd";
888      break;
889   default:
890      assert(0);
891      return bld->undef;
892   }
893
894   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
895                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
896}
897
898
899/**
900 * Return the integer part of a float (vector) value.  The returned value is
901 * a float (vector).
902 * Ex: trunc(-1.5) = 1.0
903 */
904LLVMValueRef
905lp_build_trunc(struct lp_build_context *bld,
906               LLVMValueRef a)
907{
908   const struct lp_type type = bld->type;
909
910   assert(type.floating);
911   assert(lp_check_value(type, a));
912
913   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
914      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
915   else {
916      LLVMTypeRef vec_type = lp_build_vec_type(type);
917      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
918      LLVMValueRef res;
919      res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
920      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
921      return res;
922   }
923}
924
925
926/**
927 * Return float (vector) rounded to nearest integer (vector).  The returned
928 * value is a float (vector).
929 * Ex: round(0.9) = 1.0
930 * Ex: round(-1.5) = -2.0
931 */
932LLVMValueRef
933lp_build_round(struct lp_build_context *bld,
934               LLVMValueRef a)
935{
936   const struct lp_type type = bld->type;
937
938   assert(type.floating);
939   assert(lp_check_value(type, a));
940
941   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
942      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
943   else {
944      LLVMTypeRef vec_type = lp_build_vec_type(type);
945      LLVMValueRef res;
946      res = lp_build_iround(bld, a);
947      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
948      return res;
949   }
950}
951
952
953/**
954 * Return floor of float (vector), result is a float (vector)
955 * Ex: floor(1.1) = 1.0
956 * Ex: floor(-1.1) = -2.0
957 */
958LLVMValueRef
959lp_build_floor(struct lp_build_context *bld,
960               LLVMValueRef a)
961{
962   const struct lp_type type = bld->type;
963
964   assert(type.floating);
965   assert(lp_check_value(type, a));
966
967   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
968      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
969   else {
970      LLVMTypeRef vec_type = lp_build_vec_type(type);
971      LLVMValueRef res;
972      res = lp_build_ifloor(bld, a);
973      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
974      return res;
975   }
976}
977
978
979/**
980 * Return ceiling of float (vector), returning float (vector).
981 * Ex: ceil( 1.1) = 2.0
982 * Ex: ceil(-1.1) = -1.0
983 */
984LLVMValueRef
985lp_build_ceil(struct lp_build_context *bld,
986              LLVMValueRef a)
987{
988   const struct lp_type type = bld->type;
989
990   assert(type.floating);
991   assert(lp_check_value(type, a));
992
993   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
994      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
995   else {
996      LLVMTypeRef vec_type = lp_build_vec_type(type);
997      LLVMValueRef res;
998      res = lp_build_iceil(bld, a);
999      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1000      return res;
1001   }
1002}
1003
1004
1005/**
1006 * Return fractional part of 'a' computed as a - floor(a)
1007 * Typically used in texture coord arithmetic.
1008 */
1009LLVMValueRef
1010lp_build_fract(struct lp_build_context *bld,
1011               LLVMValueRef a)
1012{
1013   assert(bld->type.floating);
1014   return lp_build_sub(bld, a, lp_build_floor(bld, a));
1015}
1016
1017
1018/**
1019 * Return the integer part of a float (vector) value.  The returned value is
1020 * an integer (vector).
1021 * Ex: itrunc(-1.5) = 1
1022 */
1023LLVMValueRef
1024lp_build_itrunc(struct lp_build_context *bld,
1025                LLVMValueRef a)
1026{
1027   const struct lp_type type = bld->type;
1028   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1029
1030   assert(type.floating);
1031   assert(lp_check_value(type, a));
1032
1033   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1034}
1035
1036
1037/**
1038 * Return float (vector) rounded to nearest integer (vector).  The returned
1039 * value is an integer (vector).
1040 * Ex: iround(0.9) = 1
1041 * Ex: iround(-1.5) = -2
1042 */
1043LLVMValueRef
1044lp_build_iround(struct lp_build_context *bld,
1045                LLVMValueRef a)
1046{
1047   const struct lp_type type = bld->type;
1048   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1049   LLVMValueRef res;
1050
1051   assert(type.floating);
1052
1053   assert(lp_check_value(type, a));
1054
1055   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1056      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1057   }
1058   else {
1059      LLVMTypeRef vec_type = lp_build_vec_type(type);
1060      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1061      LLVMValueRef sign;
1062      LLVMValueRef half;
1063
1064      /* get sign bit */
1065      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1066      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1067
1068      /* sign * 0.5 */
1069      half = lp_build_const_vec(type, 0.5);
1070      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1071      half = LLVMBuildOr(bld->builder, sign, half, "");
1072      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1073
1074      res = LLVMBuildFAdd(bld->builder, a, half, "");
1075   }
1076
1077   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1078
1079   return res;
1080}
1081
1082
1083/**
1084 * Return floor of float (vector), result is an int (vector)
1085 * Ex: ifloor(1.1) = 1.0
1086 * Ex: ifloor(-1.1) = -2.0
1087 */
1088LLVMValueRef
1089lp_build_ifloor(struct lp_build_context *bld,
1090                LLVMValueRef a)
1091{
1092   const struct lp_type type = bld->type;
1093   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1094   LLVMValueRef res;
1095
1096   assert(type.floating);
1097   assert(lp_check_value(type, a));
1098
1099   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1100      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1101   }
1102   else {
1103      /* Take the sign bit and add it to 1 constant */
1104      LLVMTypeRef vec_type = lp_build_vec_type(type);
1105      unsigned mantissa = lp_mantissa(type);
1106      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1107      LLVMValueRef sign;
1108      LLVMValueRef offset;
1109
1110      /* sign = a < 0 ? ~0 : 0 */
1111      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1112      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1113      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1114
1115      /* offset = -0.99999(9)f */
1116      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1117      offset = LLVMConstBitCast(offset, int_vec_type);
1118
1119      /* offset = a < 0 ? offset : 0.0f */
1120      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1121      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1122
1123      res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1124   }
1125
1126   /* round to nearest (toward zero) */
1127   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1128
1129   return res;
1130}
1131
1132
1133/**
1134 * Return ceiling of float (vector), returning int (vector).
1135 * Ex: iceil( 1.1) = 2
1136 * Ex: iceil(-1.1) = -1
1137 */
1138LLVMValueRef
1139lp_build_iceil(struct lp_build_context *bld,
1140               LLVMValueRef a)
1141{
1142   const struct lp_type type = bld->type;
1143   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1144   LLVMValueRef res;
1145
1146   assert(type.floating);
1147   assert(lp_check_value(type, a));
1148
1149   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1150      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1151   }
1152   else {
1153      LLVMTypeRef vec_type = lp_build_vec_type(type);
1154      unsigned mantissa = lp_mantissa(type);
1155      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1156      LLVMValueRef sign;
1157      LLVMValueRef offset;
1158
1159      /* sign = a < 0 ? 0 : ~0 */
1160      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1161      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1162      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1163      sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1164
1165      /* offset = 0.99999(9)f */
1166      offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1167      offset = LLVMConstBitCast(offset, int_vec_type);
1168
1169      /* offset = a < 0 ? 0.0 : offset */
1170      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1171      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1172
1173      res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1174   }
1175
1176   /* round to nearest (toward zero) */
1177   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1178
1179   return res;
1180}
1181
1182
1183LLVMValueRef
1184lp_build_sqrt(struct lp_build_context *bld,
1185              LLVMValueRef a)
1186{
1187   const struct lp_type type = bld->type;
1188   LLVMTypeRef vec_type = lp_build_vec_type(type);
1189   char intrinsic[32];
1190
1191   /* TODO: optimize the constant case */
1192   /* TODO: optimize the constant case */
1193
1194   assert(type.floating);
1195   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1196
1197   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1198}
1199
1200
1201LLVMValueRef
1202lp_build_rcp(struct lp_build_context *bld,
1203             LLVMValueRef a)
1204{
1205   const struct lp_type type = bld->type;
1206
1207   if(a == bld->zero)
1208      return bld->undef;
1209   if(a == bld->one)
1210      return bld->one;
1211   if(a == bld->undef)
1212      return bld->undef;
1213
1214   assert(type.floating);
1215
1216   if(LLVMIsConstant(a))
1217      return LLVMConstFDiv(bld->one, a);
1218
1219   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1220      /*
1221       * XXX: Added precision is not always necessary, so only enable this
1222       * when we have a better system in place to track minimum precision.
1223       */
1224
1225#if 0
1226      /*
1227       * Do one Newton-Raphson step to improve precision:
1228       *
1229       *   x1 = (2 - a * rcp(a)) * rcp(a)
1230       */
1231
1232      LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1233      LLVMValueRef rcp_a;
1234      LLVMValueRef res;
1235
1236      rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1237
1238      res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1239      res = LLVMBuildFSub(bld->builder, two, res, "");
1240      res = LLVMBuildFMul(bld->builder, res, rcp_a, "");
1241
1242      return rcp_a;
1243#else
1244      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1245#endif
1246   }
1247
1248   return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1249}
1250
1251
1252/**
1253 * Generate 1/sqrt(a)
1254 */
1255LLVMValueRef
1256lp_build_rsqrt(struct lp_build_context *bld,
1257               LLVMValueRef a)
1258{
1259   const struct lp_type type = bld->type;
1260
1261   assert(type.floating);
1262
1263   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1264      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1265
1266   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1267}
1268
1269
1270static inline LLVMValueRef
1271lp_build_const_v4si(unsigned long value)
1272{
1273   LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1274   LLVMValueRef elements[4] = { element, element, element, element };
1275   return LLVMConstVector(elements, 4);
1276}
1277
1278static inline LLVMValueRef
1279lp_build_const_v4sf(float value)
1280{
1281   LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1282   LLVMValueRef elements[4] = { element, element, element, element };
1283   return LLVMConstVector(elements, 4);
1284}
1285
1286
1287/**
1288 * Generate sin(a) using SSE2
1289 */
1290LLVMValueRef
1291lp_build_sin(struct lp_build_context *bld,
1292             LLVMValueRef a)
1293{
1294   struct lp_type int_type = lp_int_type(bld->type);
1295   LLVMBuilderRef b = bld->builder;
1296   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1297   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1298
1299   /*
1300    *  take the absolute value,
1301    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1302    */
1303
1304   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1305   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1306
1307   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1308   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1309
1310   /*
1311    * extract the sign bit (upper one)
1312    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1313    */
1314   LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1315   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1316
1317   /*
1318    * scale by 4/Pi
1319    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1320    */
1321
1322   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1323   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1324
1325   /*
1326    * store the integer part of y in mm0
1327    * emm2 = _mm_cvttps_epi32(y);
1328    */
1329
1330   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1331
1332   /*
1333    * j=(j+1) & (~1) (see the cephes sources)
1334    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1335    */
1336
1337   LLVMValueRef all_one = lp_build_const_v4si(1);
1338   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1339   /*
1340    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1341    */
1342   LLVMValueRef inv_one = lp_build_const_v4si(~1);
1343   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1344
1345   /*
1346    * y = _mm_cvtepi32_ps(emm2);
1347    */
1348   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1349
1350   /* get the swap sign flag
1351    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1352    */
1353   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1354   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1355
1356   /*
1357    * emm2 = _mm_slli_epi32(emm0, 29);
1358    */
1359   LLVMValueRef const_29 = lp_build_const_v4si(29);
1360   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1361
1362   /*
1363    * get the polynom selection mask
1364    * there is one polynom for 0 <= x <= Pi/4
1365    * and another one for Pi/4<x<=Pi/2
1366    * Both branches will be computed.
1367    *
1368    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1369    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1370    */
1371
1372   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1373   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1374   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1375                                             emm2_3, lp_build_const_v4si(0));
1376   /*
1377    *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1378    */
1379   LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1380
1381   /*
1382    * _PS_CONST(minus_cephes_DP1, -0.78515625);
1383    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1384    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1385    */
1386   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1387   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1388   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1389
1390   /*
1391    * The magic pass: "Extended precision modular arithmetic"
1392    * x = ((x - y * DP1) - y * DP2) - y * DP3;
1393    * xmm1 = _mm_mul_ps(y, xmm1);
1394    * xmm2 = _mm_mul_ps(y, xmm2);
1395    * xmm3 = _mm_mul_ps(y, xmm3);
1396    */
1397   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1398   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1399   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1400
1401   /*
1402    * x = _mm_add_ps(x, xmm1);
1403    * x = _mm_add_ps(x, xmm2);
1404    * x = _mm_add_ps(x, xmm3);
1405    */
1406
1407   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1408   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1409   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1410
1411   /*
1412    * Evaluate the first polynom  (0 <= x <= Pi/4)
1413    *
1414    * z = _mm_mul_ps(x,x);
1415    */
1416   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1417
1418   /*
1419    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
1420    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1421    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
1422    */
1423   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1424   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1425   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1426
1427   /*
1428    * y = *(v4sf*)_ps_coscof_p0;
1429    * y = _mm_mul_ps(y, z);
1430    */
1431   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1432   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1433   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1434   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1435   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1436   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1437
1438
1439   /*
1440    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1441    * y = _mm_sub_ps(y, tmp);
1442    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1443    */
1444   LLVMValueRef half = lp_build_const_v4sf(0.5);
1445   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1446   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1447   LLVMValueRef one = lp_build_const_v4sf(1.0);
1448   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1449
1450   /*
1451    * _PS_CONST(sincof_p0, -1.9515295891E-4);
1452    * _PS_CONST(sincof_p1,  8.3321608736E-3);
1453    * _PS_CONST(sincof_p2, -1.6666654611E-1);
1454    */
1455   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1456   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1457   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1458
1459   /*
1460    * Evaluate the second polynom  (Pi/4 <= x <= 0)
1461    *
1462    * y2 = *(v4sf*)_ps_sincof_p0;
1463    * y2 = _mm_mul_ps(y2, z);
1464    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1465    * y2 = _mm_mul_ps(y2, z);
1466    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1467    * y2 = _mm_mul_ps(y2, z);
1468    * y2 = _mm_mul_ps(y2, x);
1469    * y2 = _mm_add_ps(y2, x);
1470    */
1471
1472   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1473   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1474   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1475   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1476   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1477   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1478   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1479
1480   /*
1481    * select the correct result from the two polynoms
1482    * xmm3 = poly_mask;
1483    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1484    * y = _mm_andnot_ps(xmm3, y);
1485    * y = _mm_add_ps(y,y2);
1486    */
1487   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1488   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1489   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1490   LLVMValueRef inv = lp_build_const_v4si(~0);
1491   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1492   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1493   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1494
1495   /*
1496    * update the sign
1497    * y = _mm_xor_ps(y, sign_bit);
1498    */
1499   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1500   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1501   return y_result;
1502}
1503
1504
1505/**
1506 * Generate cos(a) using SSE2
1507 */
1508LLVMValueRef
1509lp_build_cos(struct lp_build_context *bld,
1510             LLVMValueRef a)
1511{
1512   struct lp_type int_type = lp_int_type(bld->type);
1513   LLVMBuilderRef b = bld->builder;
1514   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1515   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1516
1517   /*
1518    *  take the absolute value,
1519    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1520    */
1521
1522   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1523   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1524
1525   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1526   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1527
1528   /*
1529    * scale by 4/Pi
1530    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1531    */
1532
1533   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1534   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1535
1536   /*
1537    * store the integer part of y in mm0
1538    * emm2 = _mm_cvttps_epi32(y);
1539    */
1540
1541   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1542
1543   /*
1544    * j=(j+1) & (~1) (see the cephes sources)
1545    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1546    */
1547
1548   LLVMValueRef all_one = lp_build_const_v4si(1);
1549   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1550   /*
1551    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1552    */
1553   LLVMValueRef inv_one = lp_build_const_v4si(~1);
1554   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1555
1556   /*
1557    * y = _mm_cvtepi32_ps(emm2);
1558    */
1559   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1560
1561
1562   /*
1563    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1564    */
1565   LLVMValueRef const_2 = lp_build_const_v4si(2);
1566   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1567
1568
1569   /* get the swap sign flag
1570    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1571    */
1572   LLVMValueRef inv = lp_build_const_v4si(~0);
1573   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1574   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1575   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1576
1577   /*
1578    * emm2 = _mm_slli_epi32(emm0, 29);
1579    */
1580   LLVMValueRef const_29 = lp_build_const_v4si(29);
1581   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1582
1583   /*
1584    * get the polynom selection mask
1585    * there is one polynom for 0 <= x <= Pi/4
1586    * and another one for Pi/4<x<=Pi/2
1587    * Both branches will be computed.
1588    *
1589    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1590    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1591    */
1592
1593   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1594   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1595   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1596   				             emm2_3, lp_build_const_v4si(0));
1597
1598   /*
1599    * _PS_CONST(minus_cephes_DP1, -0.78515625);
1600    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1601    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1602    */
1603   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1604   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1605   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1606
1607   /*
1608    * The magic pass: "Extended precision modular arithmetic"
1609    * x = ((x - y * DP1) - y * DP2) - y * DP3;
1610    * xmm1 = _mm_mul_ps(y, xmm1);
1611    * xmm2 = _mm_mul_ps(y, xmm2);
1612    * xmm3 = _mm_mul_ps(y, xmm3);
1613    */
1614   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1615   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1616   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1617
1618   /*
1619    * x = _mm_add_ps(x, xmm1);
1620    * x = _mm_add_ps(x, xmm2);
1621    * x = _mm_add_ps(x, xmm3);
1622    */
1623
1624   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1625   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1626   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1627
1628   /*
1629    * Evaluate the first polynom  (0 <= x <= Pi/4)
1630    *
1631    * z = _mm_mul_ps(x,x);
1632    */
1633   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1634
1635   /*
1636    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
1637    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1638    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
1639    */
1640   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1641   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1642   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1643
1644   /*
1645    * y = *(v4sf*)_ps_coscof_p0;
1646    * y = _mm_mul_ps(y, z);
1647    */
1648   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1649   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1650   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1651   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1652   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1653   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1654
1655
1656   /*
1657    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1658    * y = _mm_sub_ps(y, tmp);
1659    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1660    */
1661   LLVMValueRef half = lp_build_const_v4sf(0.5);
1662   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1663   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1664   LLVMValueRef one = lp_build_const_v4sf(1.0);
1665   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1666
1667   /*
1668    * _PS_CONST(sincof_p0, -1.9515295891E-4);
1669    * _PS_CONST(sincof_p1,  8.3321608736E-3);
1670    * _PS_CONST(sincof_p2, -1.6666654611E-1);
1671    */
1672   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1673   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1674   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1675
1676   /*
1677    * Evaluate the second polynom  (Pi/4 <= x <= 0)
1678    *
1679    * y2 = *(v4sf*)_ps_sincof_p0;
1680    * y2 = _mm_mul_ps(y2, z);
1681    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1682    * y2 = _mm_mul_ps(y2, z);
1683    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1684    * y2 = _mm_mul_ps(y2, z);
1685    * y2 = _mm_mul_ps(y2, x);
1686    * y2 = _mm_add_ps(y2, x);
1687    */
1688
1689   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1690   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1691   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1692   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1693   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1694   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1695   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1696
1697   /*
1698    * select the correct result from the two polynoms
1699    * xmm3 = poly_mask;
1700    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1701    * y = _mm_andnot_ps(xmm3, y);
1702    * y = _mm_add_ps(y,y2);
1703    */
1704   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1705   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1706   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1707   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1708   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1709   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1710
1711   /*
1712    * update the sign
1713    * y = _mm_xor_ps(y, sign_bit);
1714    */
1715   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1716   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1717   return y_result;
1718}
1719
1720
1721/**
1722 * Generate pow(x, y)
1723 */
1724LLVMValueRef
1725lp_build_pow(struct lp_build_context *bld,
1726             LLVMValueRef x,
1727             LLVMValueRef y)
1728{
1729   /* TODO: optimize the constant case */
1730   if(LLVMIsConstant(x) && LLVMIsConstant(y))
1731      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1732                   __FUNCTION__);
1733
1734   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1735}
1736
1737
1738/**
1739 * Generate exp(x)
1740 */
1741LLVMValueRef
1742lp_build_exp(struct lp_build_context *bld,
1743             LLVMValueRef x)
1744{
1745   /* log2(e) = 1/log(2) */
1746   LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1747
1748   return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
1749}
1750
1751
1752/**
1753 * Generate log(x)
1754 */
1755LLVMValueRef
1756lp_build_log(struct lp_build_context *bld,
1757             LLVMValueRef x)
1758{
1759   /* log(2) */
1760   LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1761
1762   return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
1763}
1764
1765
1766#define EXP_POLY_DEGREE 3
1767#define LOG_POLY_DEGREE 5
1768
1769
1770/**
1771 * Generate polynomial.
1772 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1773 */
1774static LLVMValueRef
1775lp_build_polynomial(struct lp_build_context *bld,
1776                    LLVMValueRef x,
1777                    const double *coeffs,
1778                    unsigned num_coeffs)
1779{
1780   const struct lp_type type = bld->type;
1781   LLVMValueRef res = NULL;
1782   unsigned i;
1783
1784   /* TODO: optimize the constant case */
1785   if(LLVMIsConstant(x))
1786      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1787                   __FUNCTION__);
1788
1789   for (i = num_coeffs; i--; ) {
1790      LLVMValueRef coeff;
1791
1792      coeff = lp_build_const_vec(type, coeffs[i]);
1793
1794      if(res)
1795         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1796      else
1797         res = coeff;
1798   }
1799
1800   if(res)
1801      return res;
1802   else
1803      return bld->undef;
1804}
1805
1806
1807/**
1808 * Minimax polynomial fit of 2**x, in range [0, 1[
1809 */
1810const double lp_build_exp2_polynomial[] = {
1811#if EXP_POLY_DEGREE == 5
1812   0.999999999690134838155,
1813   0.583974334321735217258,
1814   0.164553105719676828492,
1815   0.0292811063701710962255,
1816   0.00354944426657875141846,
1817   0.000296253726543423377365
1818#elif EXP_POLY_DEGREE == 4
1819   1.00000001502262084505,
1820   0.563586057338685991394,
1821   0.150436017652442413623,
1822   0.0243220604213317927308,
1823   0.0025359088446580436489
1824#elif EXP_POLY_DEGREE == 3
1825   0.999925218562710312959,
1826   0.695833540494823811697,
1827   0.226067155427249155588,
1828   0.0780245226406372992967
1829#elif EXP_POLY_DEGREE == 2
1830   1.00172476321474503578,
1831   0.657636275736077639316,
1832   0.33718943461968720704
1833#else
1834#error
1835#endif
1836};
1837
1838
1839void
1840lp_build_exp2_approx(struct lp_build_context *bld,
1841                     LLVMValueRef x,
1842                     LLVMValueRef *p_exp2_int_part,
1843                     LLVMValueRef *p_frac_part,
1844                     LLVMValueRef *p_exp2)
1845{
1846   const struct lp_type type = bld->type;
1847   LLVMTypeRef vec_type = lp_build_vec_type(type);
1848   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1849   LLVMValueRef ipart = NULL;
1850   LLVMValueRef fpart = NULL;
1851   LLVMValueRef expipart = NULL;
1852   LLVMValueRef expfpart = NULL;
1853   LLVMValueRef res = NULL;
1854
1855   if(p_exp2_int_part || p_frac_part || p_exp2) {
1856      /* TODO: optimize the constant case */
1857      if(LLVMIsConstant(x))
1858         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1859                      __FUNCTION__);
1860
1861      assert(type.floating && type.width == 32);
1862
1863      x = lp_build_min(bld, x, lp_build_const_vec(type,  129.0));
1864      x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1865
1866      /* ipart = floor(x) */
1867      ipart = lp_build_floor(bld, x);
1868
1869      /* fpart = x - ipart */
1870      fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
1871   }
1872
1873   if(p_exp2_int_part || p_exp2) {
1874      /* expipart = (float) (1 << ipart) */
1875      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1876      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
1877      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
1878      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1879   }
1880
1881   if(p_exp2) {
1882      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1883                                     Elements(lp_build_exp2_polynomial));
1884
1885      res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
1886   }
1887
1888   if(p_exp2_int_part)
1889      *p_exp2_int_part = expipart;
1890
1891   if(p_frac_part)
1892      *p_frac_part = fpart;
1893
1894   if(p_exp2)
1895      *p_exp2 = res;
1896}
1897
1898
1899LLVMValueRef
1900lp_build_exp2(struct lp_build_context *bld,
1901              LLVMValueRef x)
1902{
1903   LLVMValueRef res;
1904   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1905   return res;
1906}
1907
1908
1909/**
1910 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1911 * These coefficients can be generate with
1912 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1913 */
1914const double lp_build_log2_polynomial[] = {
1915#if LOG_POLY_DEGREE == 6
1916   3.11578814719469302614,
1917   -3.32419399085241980044,
1918   2.59883907202499966007,
1919   -1.23152682416275988241,
1920   0.318212422185251071475,
1921   -0.0344359067839062357313
1922#elif LOG_POLY_DEGREE == 5
1923   2.8882704548164776201,
1924   -2.52074962577807006663,
1925   1.48116647521213171641,
1926   -0.465725644288844778798,
1927   0.0596515482674574969533
1928#elif LOG_POLY_DEGREE == 4
1929   2.61761038894603480148,
1930   -1.75647175389045657003,
1931   0.688243882994381274313,
1932   -0.107254423828329604454
1933#elif LOG_POLY_DEGREE == 3
1934   2.28330284476918490682,
1935   -1.04913055217340124191,
1936   0.204446009836232697516
1937#else
1938#error
1939#endif
1940};
1941
1942
1943/**
1944 * See http://www.devmaster.net/forums/showthread.php?p=43580
1945 */
1946void
1947lp_build_log2_approx(struct lp_build_context *bld,
1948                     LLVMValueRef x,
1949                     LLVMValueRef *p_exp,
1950                     LLVMValueRef *p_floor_log2,
1951                     LLVMValueRef *p_log2)
1952{
1953   const struct lp_type type = bld->type;
1954   LLVMTypeRef vec_type = lp_build_vec_type(type);
1955   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1956
1957   LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
1958   LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
1959   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1960
1961   LLVMValueRef i = NULL;
1962   LLVMValueRef exp = NULL;
1963   LLVMValueRef mant = NULL;
1964   LLVMValueRef logexp = NULL;
1965   LLVMValueRef logmant = NULL;
1966   LLVMValueRef res = NULL;
1967
1968   if(p_exp || p_floor_log2 || p_log2) {
1969      /* TODO: optimize the constant case */
1970      if(LLVMIsConstant(x))
1971         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1972                      __FUNCTION__);
1973
1974      assert(type.floating && type.width == 32);
1975
1976      i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1977
1978      /* exp = (float) exponent(x) */
1979      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1980   }
1981
1982   if(p_floor_log2 || p_log2) {
1983      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
1984      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
1985      logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1986   }
1987
1988   if(p_log2) {
1989      /* mant = (float) mantissa(x) */
1990      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1991      mant = LLVMBuildOr(bld->builder, mant, one, "");
1992      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
1993
1994      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1995                                    Elements(lp_build_log2_polynomial));
1996
1997      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1998      logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
1999
2000      res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2001   }
2002
2003   if(p_exp) {
2004      exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2005      *p_exp = exp;
2006   }
2007
2008   if(p_floor_log2)
2009      *p_floor_log2 = logexp;
2010
2011   if(p_log2)
2012      *p_log2 = res;
2013}
2014
2015
2016LLVMValueRef
2017lp_build_log2(struct lp_build_context *bld,
2018              LLVMValueRef x)
2019{
2020   LLVMValueRef res;
2021   lp_build_log2_approx(bld, x, NULL, NULL, &res);
2022   return res;
2023}
2024