lp_bld_arit.c revision 3c929e55449410f97c7d9213d09aa88ef02c888c
1/**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
46
47
48#include "util/u_memory.h"
49#include "util/u_debug.h"
50#include "util/u_math.h"
51#include "util/u_string.h"
52#include "util/u_cpu_detect.h"
53
54#include "lp_bld_type.h"
55#include "lp_bld_const.h"
56#include "lp_bld_intr.h"
57#include "lp_bld_init.h" /* for lp_build_engine */
58#include "lp_bld_logic.h"
59#include "lp_bld_pack.h"
60#include "lp_bld_debug.h"
61#include "lp_bld_arit.h"
62#include "lp_bld_printf.h"
63
64
65/**
66 * Generate min(a, b)
67 * No checks for special case values of a or b = 1 or 0 are done.
68 */
69static LLVMValueRef
70lp_build_min_simple(struct lp_build_context *bld,
71                    LLVMValueRef a,
72                    LLVMValueRef b)
73{
74   const struct lp_type type = bld->type;
75   const char *intrinsic = NULL;
76   LLVMValueRef cond;
77
78   /* TODO: optimize the constant case */
79
80   if(type.width * type.length == 128) {
81      if(type.floating) {
82         if(type.width == 32 && util_cpu_caps.has_sse)
83            intrinsic = "llvm.x86.sse.min.ps";
84         if(type.width == 64 && util_cpu_caps.has_sse2)
85            intrinsic = "llvm.x86.sse2.min.pd";
86      }
87      else {
88         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
89            intrinsic = "llvm.x86.sse2.pminu.b";
90         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
91            intrinsic = "llvm.x86.sse41.pminsb";
92         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
93            intrinsic = "llvm.x86.sse41.pminuw";
94         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
95            intrinsic = "llvm.x86.sse2.pmins.w";
96         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
97            intrinsic = "llvm.x86.sse41.pminud";
98         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
99            intrinsic = "llvm.x86.sse41.pminsd";
100      }
101   }
102
103   if(intrinsic)
104      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
105
106   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
107   return lp_build_select(bld, cond, a, b);
108}
109
110
111/**
112 * Generate max(a, b)
113 * No checks for special case values of a or b = 1 or 0 are done.
114 */
115static LLVMValueRef
116lp_build_max_simple(struct lp_build_context *bld,
117                    LLVMValueRef a,
118                    LLVMValueRef b)
119{
120   const struct lp_type type = bld->type;
121   const char *intrinsic = NULL;
122   LLVMValueRef cond;
123
124   /* TODO: optimize the constant case */
125
126   if(type.width * type.length == 128) {
127      if(type.floating) {
128         if(type.width == 32 && util_cpu_caps.has_sse)
129            intrinsic = "llvm.x86.sse.max.ps";
130         if(type.width == 64 && util_cpu_caps.has_sse2)
131            intrinsic = "llvm.x86.sse2.max.pd";
132      }
133      else {
134         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
135            intrinsic = "llvm.x86.sse2.pmaxu.b";
136         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
137            intrinsic = "llvm.x86.sse41.pmaxsb";
138         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
139            intrinsic = "llvm.x86.sse41.pmaxuw";
140         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
141            intrinsic = "llvm.x86.sse2.pmaxs.w";
142         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
143            intrinsic = "llvm.x86.sse41.pmaxud";
144         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
145            intrinsic = "llvm.x86.sse41.pmaxsd";
146      }
147   }
148
149   if(intrinsic)
150      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
151
152   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
153   return lp_build_select(bld, cond, a, b);
154}
155
156
157/**
158 * Generate 1 - a, or ~a depending on bld->type.
159 */
160LLVMValueRef
161lp_build_comp(struct lp_build_context *bld,
162              LLVMValueRef a)
163{
164   const struct lp_type type = bld->type;
165
166   if(a == bld->one)
167      return bld->zero;
168   if(a == bld->zero)
169      return bld->one;
170
171   if(type.norm && !type.floating && !type.fixed && !type.sign) {
172      if(LLVMIsConstant(a))
173         return LLVMConstNot(a);
174      else
175         return LLVMBuildNot(bld->builder, a, "");
176   }
177
178   if(LLVMIsConstant(a))
179      return LLVMConstSub(bld->one, a);
180   else
181      return LLVMBuildSub(bld->builder, bld->one, a, "");
182}
183
184
185/**
186 * Generate a + b
187 */
188LLVMValueRef
189lp_build_add(struct lp_build_context *bld,
190             LLVMValueRef a,
191             LLVMValueRef b)
192{
193   const struct lp_type type = bld->type;
194   LLVMValueRef res;
195
196   if(a == bld->zero)
197      return b;
198   if(b == bld->zero)
199      return a;
200   if(a == bld->undef || b == bld->undef)
201      return bld->undef;
202
203   if(bld->type.norm) {
204      const char *intrinsic = NULL;
205
206      if(a == bld->one || b == bld->one)
207        return bld->one;
208
209      if(util_cpu_caps.has_sse2 &&
210         type.width * type.length == 128 &&
211         !type.floating && !type.fixed) {
212         if(type.width == 8)
213            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
214         if(type.width == 16)
215            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
216      }
217
218      if(intrinsic)
219         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
220   }
221
222   if(LLVMIsConstant(a) && LLVMIsConstant(b))
223      res = LLVMConstAdd(a, b);
224   else
225      res = LLVMBuildAdd(bld->builder, a, b, "");
226
227   /* clamp to ceiling of 1.0 */
228   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
229      res = lp_build_min_simple(bld, res, bld->one);
230
231   /* XXX clamp to floor of -1 or 0??? */
232
233   return res;
234}
235
236
237/** Return the sum of the elements of a */
238LLVMValueRef
239lp_build_sum_vector(struct lp_build_context *bld,
240                    LLVMValueRef a)
241{
242   const struct lp_type type = bld->type;
243   LLVMValueRef index, res;
244   unsigned i;
245
246   if (a == bld->zero)
247      return bld->zero;
248   if (a == bld->undef)
249      return bld->undef;
250   assert(type.length > 1);
251
252   assert(!bld->type.norm);
253
254   index = LLVMConstInt(LLVMInt32Type(), 0, 0);
255   res = LLVMBuildExtractElement(bld->builder, a, index, "");
256
257   for (i = 1; i < type.length; i++) {
258      index = LLVMConstInt(LLVMInt32Type(), i, 0);
259      res = LLVMBuildAdd(bld->builder, res,
260                         LLVMBuildExtractElement(bld->builder, a, index, ""),
261                         "");
262   }
263
264   return res;
265}
266
267
268/**
269 * Generate a - b
270 */
271LLVMValueRef
272lp_build_sub(struct lp_build_context *bld,
273             LLVMValueRef a,
274             LLVMValueRef b)
275{
276   const struct lp_type type = bld->type;
277   LLVMValueRef res;
278
279   if(b == bld->zero)
280      return a;
281   if(a == bld->undef || b == bld->undef)
282      return bld->undef;
283   if(a == b)
284      return bld->zero;
285
286   if(bld->type.norm) {
287      const char *intrinsic = NULL;
288
289      if(b == bld->one)
290        return bld->zero;
291
292      if(util_cpu_caps.has_sse2 &&
293         type.width * type.length == 128 &&
294         !type.floating && !type.fixed) {
295         if(type.width == 8)
296            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
297         if(type.width == 16)
298            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
299      }
300
301      if(intrinsic)
302         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
303   }
304
305   if(LLVMIsConstant(a) && LLVMIsConstant(b))
306      res = LLVMConstSub(a, b);
307   else
308      res = LLVMBuildSub(bld->builder, a, b, "");
309
310   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
311      res = lp_build_max_simple(bld, res, bld->zero);
312
313   return res;
314}
315
316
317/**
318 * Normalized 8bit multiplication.
319 *
320 * - alpha plus one
321 *
322 *     makes the following approximation to the division (Sree)
323 *
324 *       a*b/255 ~= (a*(b + 1)) >> 256
325 *
326 *     which is the fastest method that satisfies the following OpenGL criteria
327 *
328 *       0*0 = 0 and 255*255 = 255
329 *
330 * - geometric series
331 *
332 *     takes the geometric series approximation to the division
333 *
334 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
335 *
336 *     in this case just the first two terms to fit in 16bit arithmetic
337 *
338 *       t/255 ~= (t + (t >> 8)) >> 8
339 *
340 *     note that just by itself it doesn't satisfies the OpenGL criteria, as
341 *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
342 *     must be used
343 *
344 * - geometric series plus rounding
345 *
346 *     when using a geometric series division instead of truncating the result
347 *     use roundoff in the approximation (Jim Blinn)
348 *
349 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
350 *
351 *     achieving the exact results
352 *
353 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
354 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
355 * @sa Michael Herf, The "double blend trick", May 2000,
356 *     http://www.stereopsis.com/doubleblend.html
357 */
358static LLVMValueRef
359lp_build_mul_u8n(LLVMBuilderRef builder,
360                 struct lp_type i16_type,
361                 LLVMValueRef a, LLVMValueRef b)
362{
363   LLVMValueRef c8;
364   LLVMValueRef ab;
365
366   c8 = lp_build_const_int_vec(i16_type, 8);
367
368#if 0
369
370   /* a*b/255 ~= (a*(b + 1)) >> 256 */
371   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
372   ab = LLVMBuildMul(builder, a, b, "");
373
374#else
375
376   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
377   ab = LLVMBuildMul(builder, a, b, "");
378   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
379   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
380
381#endif
382
383   ab = LLVMBuildLShr(builder, ab, c8, "");
384
385   return ab;
386}
387
388
389/**
390 * Generate a * b
391 */
392LLVMValueRef
393lp_build_mul(struct lp_build_context *bld,
394             LLVMValueRef a,
395             LLVMValueRef b)
396{
397   const struct lp_type type = bld->type;
398   LLVMValueRef shift;
399   LLVMValueRef res;
400
401   if(a == bld->zero)
402      return bld->zero;
403   if(a == bld->one)
404      return b;
405   if(b == bld->zero)
406      return bld->zero;
407   if(b == bld->one)
408      return a;
409   if(a == bld->undef || b == bld->undef)
410      return bld->undef;
411
412   if(!type.floating && !type.fixed && type.norm) {
413      if(type.width == 8) {
414         struct lp_type i16_type = lp_wider_type(type);
415         LLVMValueRef al, ah, bl, bh, abl, abh, ab;
416
417         lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
418         lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
419
420         /* PMULLW, PSRLW, PADDW */
421         abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
422         abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
423
424         ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
425
426         return ab;
427      }
428
429      /* FIXME */
430      assert(0);
431   }
432
433   if(type.fixed)
434      shift = lp_build_const_int_vec(type, type.width/2);
435   else
436      shift = NULL;
437
438   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
439      res =  LLVMConstMul(a, b);
440      if(shift) {
441         if(type.sign)
442            res = LLVMConstAShr(res, shift);
443         else
444            res = LLVMConstLShr(res, shift);
445      }
446   }
447   else {
448      res = LLVMBuildMul(bld->builder, a, b, "");
449      if(shift) {
450         if(type.sign)
451            res = LLVMBuildAShr(bld->builder, res, shift, "");
452         else
453            res = LLVMBuildLShr(bld->builder, res, shift, "");
454      }
455   }
456
457   return res;
458}
459
460
461/**
462 * Small vector x scale multiplication optimization.
463 */
464LLVMValueRef
465lp_build_mul_imm(struct lp_build_context *bld,
466                 LLVMValueRef a,
467                 int b)
468{
469   LLVMValueRef factor;
470
471   if(b == 0)
472      return bld->zero;
473
474   if(b == 1)
475      return a;
476
477   if(b == -1)
478      return LLVMBuildNeg(bld->builder, a, "");
479
480   if(b == 2 && bld->type.floating)
481      return lp_build_add(bld, a, a);
482
483   if(util_is_pot(b)) {
484      unsigned shift = ffs(b) - 1;
485
486      if(bld->type.floating) {
487#if 0
488         /*
489          * Power of two multiplication by directly manipulating the mantissa.
490          *
491          * XXX: This might not be always faster, it will introduce a small error
492          * for multiplication by zero, and it will produce wrong results
493          * for Inf and NaN.
494          */
495         unsigned mantissa = lp_mantissa(bld->type);
496         factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
497         a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
498         a = LLVMBuildAdd(bld->builder, a, factor, "");
499         a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
500         return a;
501#endif
502      }
503      else {
504         factor = lp_build_const_vec(bld->type, shift);
505         return LLVMBuildShl(bld->builder, a, factor, "");
506      }
507   }
508
509   factor = lp_build_const_vec(bld->type, (double)b);
510   return lp_build_mul(bld, a, factor);
511}
512
513
514/**
515 * Generate a / b
516 */
517LLVMValueRef
518lp_build_div(struct lp_build_context *bld,
519             LLVMValueRef a,
520             LLVMValueRef b)
521{
522   const struct lp_type type = bld->type;
523
524   if(a == bld->zero)
525      return bld->zero;
526   if(a == bld->one)
527      return lp_build_rcp(bld, b);
528   if(b == bld->zero)
529      return bld->undef;
530   if(b == bld->one)
531      return a;
532   if(a == bld->undef || b == bld->undef)
533      return bld->undef;
534
535   if(LLVMIsConstant(a) && LLVMIsConstant(b))
536      return LLVMConstFDiv(a, b);
537
538   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
539      return lp_build_mul(bld, a, lp_build_rcp(bld, b));
540
541   return LLVMBuildFDiv(bld->builder, a, b, "");
542}
543
544
545/**
546 * Linear interpolation.
547 *
548 * This also works for integer values with a few caveats.
549 *
550 * @sa http://www.stereopsis.com/doubleblend.html
551 */
552LLVMValueRef
553lp_build_lerp(struct lp_build_context *bld,
554              LLVMValueRef x,
555              LLVMValueRef v0,
556              LLVMValueRef v1)
557{
558   LLVMValueRef delta;
559   LLVMValueRef res;
560
561   delta = lp_build_sub(bld, v1, v0);
562
563   res = lp_build_mul(bld, x, delta);
564
565   res = lp_build_add(bld, v0, res);
566
567   if(bld->type.fixed)
568      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
569       * but it will be wrong for other uses. Basically we need a more
570       * powerful lp_type, capable of further distinguishing the values
571       * interpretation from the value storage. */
572      res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
573
574   return res;
575}
576
577
578LLVMValueRef
579lp_build_lerp_2d(struct lp_build_context *bld,
580                 LLVMValueRef x,
581                 LLVMValueRef y,
582                 LLVMValueRef v00,
583                 LLVMValueRef v01,
584                 LLVMValueRef v10,
585                 LLVMValueRef v11)
586{
587   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
588   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
589   return lp_build_lerp(bld, y, v0, v1);
590}
591
592
593/**
594 * Generate min(a, b)
595 * Do checks for special cases.
596 */
597LLVMValueRef
598lp_build_min(struct lp_build_context *bld,
599             LLVMValueRef a,
600             LLVMValueRef b)
601{
602   if(a == bld->undef || b == bld->undef)
603      return bld->undef;
604
605   if(a == b)
606      return a;
607
608   if(bld->type.norm) {
609      if(a == bld->zero || b == bld->zero)
610         return bld->zero;
611      if(a == bld->one)
612         return b;
613      if(b == bld->one)
614         return a;
615   }
616
617   return lp_build_min_simple(bld, a, b);
618}
619
620
621/**
622 * Generate max(a, b)
623 * Do checks for special cases.
624 */
625LLVMValueRef
626lp_build_max(struct lp_build_context *bld,
627             LLVMValueRef a,
628             LLVMValueRef b)
629{
630   if(a == bld->undef || b == bld->undef)
631      return bld->undef;
632
633   if(a == b)
634      return a;
635
636   if(bld->type.norm) {
637      if(a == bld->one || b == bld->one)
638         return bld->one;
639      if(a == bld->zero)
640         return b;
641      if(b == bld->zero)
642         return a;
643   }
644
645   return lp_build_max_simple(bld, a, b);
646}
647
648
649/**
650 * Generate clamp(a, min, max)
651 * Do checks for special cases.
652 */
653LLVMValueRef
654lp_build_clamp(struct lp_build_context *bld,
655               LLVMValueRef a,
656               LLVMValueRef min,
657               LLVMValueRef max)
658{
659   a = lp_build_min(bld, a, max);
660   a = lp_build_max(bld, a, min);
661   return a;
662}
663
664
665/**
666 * Generate abs(a)
667 */
668LLVMValueRef
669lp_build_abs(struct lp_build_context *bld,
670             LLVMValueRef a)
671{
672   const struct lp_type type = bld->type;
673   LLVMTypeRef vec_type = lp_build_vec_type(type);
674
675   if(!type.sign)
676      return a;
677
678   if(type.floating) {
679      /* Mask out the sign bit */
680      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
681      unsigned long long absMask = ~(1ULL << (type.width - 1));
682      LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
683      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
684      a = LLVMBuildAnd(bld->builder, a, mask, "");
685      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
686      return a;
687   }
688
689   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
690      switch(type.width) {
691      case 8:
692         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
693      case 16:
694         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
695      case 32:
696         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
697      }
698   }
699
700   return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
701}
702
703
704LLVMValueRef
705lp_build_negate(struct lp_build_context *bld,
706                LLVMValueRef a)
707{
708   return LLVMBuildNeg(bld->builder, a, "");
709}
710
711
712/** Return -1, 0 or +1 depending on the sign of a */
713LLVMValueRef
714lp_build_sgn(struct lp_build_context *bld,
715             LLVMValueRef a)
716{
717   const struct lp_type type = bld->type;
718   LLVMValueRef cond;
719   LLVMValueRef res;
720
721   /* Handle non-zero case */
722   if(!type.sign) {
723      /* if not zero then sign must be positive */
724      res = bld->one;
725   }
726   else if(type.floating) {
727      LLVMTypeRef vec_type;
728      LLVMTypeRef int_type;
729      LLVMValueRef mask;
730      LLVMValueRef sign;
731      LLVMValueRef one;
732      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
733
734      int_type = lp_build_int_vec_type(type);
735      vec_type = lp_build_vec_type(type);
736      mask = lp_build_const_int_vec(type, maskBit);
737
738      /* Take the sign bit and add it to 1 constant */
739      sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
740      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
741      one = LLVMConstBitCast(bld->one, int_type);
742      res = LLVMBuildOr(bld->builder, sign, one, "");
743      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
744   }
745   else
746   {
747      LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
748      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
749      res = lp_build_select(bld, cond, bld->one, minus_one);
750   }
751
752   /* Handle zero */
753   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
754   res = lp_build_select(bld, cond, bld->zero, res);
755
756   return res;
757}
758
759
760/**
761 * Set the sign of float vector 'a' according to 'sign'.
762 * If sign==0, return abs(a).
763 * If sign==1, return -abs(a);
764 * Other values for sign produce undefined results.
765 */
766LLVMValueRef
767lp_build_set_sign(struct lp_build_context *bld,
768                  LLVMValueRef a, LLVMValueRef sign)
769{
770   const struct lp_type type = bld->type;
771   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
772   LLVMTypeRef vec_type = lp_build_vec_type(type);
773   LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
774   LLVMValueRef mask = lp_build_const_int_vec(type,
775                             ~((unsigned long long) 1 << (type.width - 1)));
776   LLVMValueRef val, res;
777
778   assert(type.floating);
779
780   /* val = reinterpret_cast<int>(a) */
781   val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
782   /* val = val & mask */
783   val = LLVMBuildAnd(bld->builder, val, mask, "");
784   /* sign = sign << shift */
785   sign = LLVMBuildShl(bld->builder, sign, shift, "");
786   /* res = val | sign */
787   res = LLVMBuildOr(bld->builder, val, sign, "");
788   /* res = reinterpret_cast<float>(res) */
789   res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
790
791   return res;
792}
793
794
795/**
796 * Convert vector of (or scalar) int to vector of (or scalar) float.
797 */
798LLVMValueRef
799lp_build_int_to_float(struct lp_build_context *bld,
800                      LLVMValueRef a)
801{
802   const struct lp_type type = bld->type;
803   LLVMTypeRef vec_type = lp_build_vec_type(type);
804
805   assert(type.floating);
806
807   return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
808}
809
810
811
/**
 * Rounding mode selector for lp_build_round_sse41().
 *
 * The numeric value is passed unchanged as the 32 bit integer constant
 * operand of the llvm.x86.sse41.round.* intrinsics, so the values must not
 * be renumbered.
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};
819
820
821static INLINE LLVMValueRef
822lp_build_round_sse41(struct lp_build_context *bld,
823                     LLVMValueRef a,
824                     enum lp_build_round_sse41_mode mode)
825{
826   const struct lp_type type = bld->type;
827   LLVMTypeRef vec_type = lp_build_vec_type(type);
828   const char *intrinsic;
829
830   assert(type.floating);
831   assert(type.width*type.length == 128);
832   assert(lp_check_value(type, a));
833   assert(util_cpu_caps.has_sse4_1);
834
835   switch(type.width) {
836   case 32:
837      intrinsic = "llvm.x86.sse41.round.ps";
838      break;
839   case 64:
840      intrinsic = "llvm.x86.sse41.round.pd";
841      break;
842   default:
843      assert(0);
844      return bld->undef;
845   }
846
847   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
848                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
849}
850
851
852LLVMValueRef
853lp_build_trunc(struct lp_build_context *bld,
854               LLVMValueRef a)
855{
856   const struct lp_type type = bld->type;
857
858   assert(type.floating);
859   assert(lp_check_value(type, a));
860
861   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
862      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
863   else {
864      LLVMTypeRef vec_type = lp_build_vec_type(type);
865      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
866      LLVMValueRef res;
867      res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
868      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
869      return res;
870   }
871}
872
873
874LLVMValueRef
875lp_build_round(struct lp_build_context *bld,
876               LLVMValueRef a)
877{
878   const struct lp_type type = bld->type;
879
880   assert(type.floating);
881   assert(lp_check_value(type, a));
882
883   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
884      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
885   else {
886      LLVMTypeRef vec_type = lp_build_vec_type(type);
887      LLVMValueRef res;
888      res = lp_build_iround(bld, a);
889      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
890      return res;
891   }
892}
893
894
895LLVMValueRef
896lp_build_floor(struct lp_build_context *bld,
897               LLVMValueRef a)
898{
899   const struct lp_type type = bld->type;
900
901   assert(type.floating);
902   assert(lp_check_value(type, a));
903
904   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
905      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
906   else {
907      LLVMTypeRef vec_type = lp_build_vec_type(type);
908      LLVMValueRef res;
909      res = lp_build_ifloor(bld, a);
910      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
911      return res;
912   }
913}
914
915
916LLVMValueRef
917lp_build_ceil(struct lp_build_context *bld,
918              LLVMValueRef a)
919{
920   const struct lp_type type = bld->type;
921
922   assert(type.floating);
923   assert(lp_check_value(type, a));
924
925   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
926      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
927   else {
928      LLVMTypeRef vec_type = lp_build_vec_type(type);
929      LLVMValueRef res;
930      res = lp_build_iceil(bld, a);
931      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
932      return res;
933   }
934}
935
936
937/**
938 * Return fractional part of 'a' computed as a - floor(f)
939 * Typically used in texture coord arithmetic.
940 */
941LLVMValueRef
942lp_build_fract(struct lp_build_context *bld,
943               LLVMValueRef a)
944{
945   assert(bld->type.floating);
946   return lp_build_sub(bld, a, lp_build_floor(bld, a));
947}
948
949
950/**
951 * Convert to integer, through whichever rounding method that's fastest,
952 * typically truncating toward zero.
953 */
954LLVMValueRef
955lp_build_itrunc(struct lp_build_context *bld,
956                LLVMValueRef a)
957{
958   const struct lp_type type = bld->type;
959   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
960
961   assert(type.floating);
962   assert(lp_check_value(type, a));
963
964   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
965}
966
967
968/**
969 * Convert float[] to int[] with round().
970 */
971LLVMValueRef
972lp_build_iround(struct lp_build_context *bld,
973                LLVMValueRef a)
974{
975   const struct lp_type type = bld->type;
976   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
977   LLVMValueRef res;
978
979   assert(type.floating);
980
981   assert(lp_check_value(type, a));
982
983   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
984      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
985   }
986   else {
987      LLVMTypeRef vec_type = lp_build_vec_type(type);
988      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
989      LLVMValueRef sign;
990      LLVMValueRef half;
991
992      /* get sign bit */
993      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
994      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
995
996      /* sign * 0.5 */
997      half = lp_build_const_vec(type, 0.5);
998      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
999      half = LLVMBuildOr(bld->builder, sign, half, "");
1000      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1001
1002      res = LLVMBuildAdd(bld->builder, a, half, "");
1003   }
1004
1005   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1006
1007   return res;
1008}
1009
1010
1011/**
1012 * Convert float[] to int[] with floor().
1013 */
1014LLVMValueRef
1015lp_build_ifloor(struct lp_build_context *bld,
1016                LLVMValueRef a)
1017{
1018   const struct lp_type type = bld->type;
1019   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1020   LLVMValueRef res;
1021
1022   assert(type.floating);
1023   assert(lp_check_value(type, a));
1024
1025   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1026      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1027   }
1028   else {
1029      /* Take the sign bit and add it to 1 constant */
1030      LLVMTypeRef vec_type = lp_build_vec_type(type);
1031      unsigned mantissa = lp_mantissa(type);
1032      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1033      LLVMValueRef sign;
1034      LLVMValueRef offset;
1035
1036      /* sign = a < 0 ? ~0 : 0 */
1037      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1038      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1039      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
1040      lp_build_name(sign, "floor.sign");
1041
1042      /* offset = -0.99999(9)f */
1043      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
1044      offset = LLVMConstBitCast(offset, int_vec_type);
1045
1046      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
1047      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1048      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
1049      lp_build_name(offset, "floor.offset");
1050
1051      res = LLVMBuildAdd(bld->builder, a, offset, "");
1052      lp_build_name(res, "floor.res");
1053   }
1054
1055   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1056   lp_build_name(res, "floor");
1057
1058   return res;
1059}
1060
1061
1062LLVMValueRef
1063lp_build_iceil(struct lp_build_context *bld,
1064               LLVMValueRef a)
1065{
1066   const struct lp_type type = bld->type;
1067   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1068   LLVMValueRef res;
1069
1070   assert(type.floating);
1071   assert(lp_check_value(type, a));
1072
1073   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1074      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1075   }
1076   else {
1077      /* TODO: mimic lp_build_ifloor() here */
1078      assert(0);
1079      res = bld->undef;
1080   }
1081
1082   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1083
1084   return res;
1085}
1086
1087
1088LLVMValueRef
1089lp_build_sqrt(struct lp_build_context *bld,
1090              LLVMValueRef a)
1091{
1092   const struct lp_type type = bld->type;
1093   LLVMTypeRef vec_type = lp_build_vec_type(type);
1094   char intrinsic[32];
1095
1096   /* TODO: optimize the constant case */
1097   /* TODO: optimize the constant case */
1098
1099   assert(type.floating);
1100   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1101
1102   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1103}
1104
1105
1106LLVMValueRef
1107lp_build_rcp(struct lp_build_context *bld,
1108             LLVMValueRef a)
1109{
1110   const struct lp_type type = bld->type;
1111
1112   if(a == bld->zero)
1113      return bld->undef;
1114   if(a == bld->one)
1115      return bld->one;
1116   if(a == bld->undef)
1117      return bld->undef;
1118
1119   assert(type.floating);
1120
1121   if(LLVMIsConstant(a))
1122      return LLVMConstFDiv(bld->one, a);
1123
1124   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1125      /*
1126       * XXX: Added precision is not always necessary, so only enable this
1127       * when we have a better system in place to track minimum precision.
1128       */
1129
1130#if 0
1131      /*
1132       * Do one Newton-Raphson step to improve precision:
1133       *
1134       *   x1 = (2 - a * rcp(a)) * rcp(a)
1135       */
1136
1137      LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1138      LLVMValueRef rcp_a;
1139      LLVMValueRef res;
1140
1141      rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1142
1143      res = LLVMBuildMul(bld->builder, a, rcp_a, "");
1144      res = LLVMBuildSub(bld->builder, two, res, "");
1145      res = LLVMBuildMul(bld->builder, res, rcp_a, "");
1146
1147      return rcp_a;
1148#else
1149      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1150#endif
1151   }
1152
1153   return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1154}
1155
1156
1157/**
1158 * Generate 1/sqrt(a)
1159 */
1160LLVMValueRef
1161lp_build_rsqrt(struct lp_build_context *bld,
1162               LLVMValueRef a)
1163{
1164   const struct lp_type type = bld->type;
1165
1166   assert(type.floating);
1167
1168   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1169      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1170
1171   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1172}
1173
1174
1175static inline LLVMValueRef
1176lp_build_const_v4si(unsigned long value)
1177{
1178   LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1179   LLVMValueRef elements[4] = { element, element, element, element };
1180   return LLVMConstVector(elements, 4);
1181}
1182
1183static inline LLVMValueRef
1184lp_build_const_v4sf(float value)
1185{
1186   LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1187   LLVMValueRef elements[4] = { element, element, element, element };
1188   return LLVMConstVector(elements, 4);
1189}
1190
1191
1192/**
1193 * Generate sin(a) using SSE2
1194 */
1195LLVMValueRef
1196lp_build_sin(struct lp_build_context *bld,
1197             LLVMValueRef a)
1198{
1199   struct lp_type int_type = lp_int_type(bld->type);
1200   LLVMBuilderRef b = bld->builder;
1201   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1202   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1203
1204   /*
1205    *  take the absolute value,
1206    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1207    */
1208
1209   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1210   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1211
1212   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1213   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1214
1215   /*
1216    * extract the sign bit (upper one)
1217    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1218    */
1219   LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1220   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1221
1222   /*
1223    * scale by 4/Pi
1224    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1225    */
1226
1227   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1228   LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y");
1229
1230   /*
1231    * store the integer part of y in mm0
1232    * emm2 = _mm_cvttps_epi32(y);
1233    */
1234
1235   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1236
1237   /*
1238    * j=(j+1) & (~1) (see the cephes sources)
1239    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1240    */
1241
1242   LLVMValueRef all_one = lp_build_const_v4si(1);
1243   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1244   /*
1245    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1246    */
1247   LLVMValueRef inv_one = lp_build_const_v4si(~1);
1248   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1249
1250   /*
1251    * y = _mm_cvtepi32_ps(emm2);
1252    */
1253   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1254
1255   /* get the swap sign flag
1256    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1257    */
1258   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1259   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1260
1261   /*
1262    * emm2 = _mm_slli_epi32(emm0, 29);
1263    */
1264   LLVMValueRef const_29 = lp_build_const_v4si(29);
1265   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1266
1267   /*
1268    * get the polynom selection mask
1269    * there is one polynom for 0 <= x <= Pi/4
1270    * and another one for Pi/4<x<=Pi/2
1271    * Both branches will be computed.
1272    *
1273    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1274    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1275    */
1276
1277   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1278   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1279   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1280                                             emm2_3, lp_build_const_v4si(0));
1281   /*
1282    *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1283    */
1284   LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1285
1286   /*
1287    * _PS_CONST(minus_cephes_DP1, -0.78515625);
1288    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1289    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1290    */
1291   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1292   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1293   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1294
1295   /*
1296    * The magic pass: "Extended precision modular arithmetic"
1297    * x = ((x - y * DP1) - y * DP2) - y * DP3;
1298    * xmm1 = _mm_mul_ps(y, xmm1);
1299    * xmm2 = _mm_mul_ps(y, xmm2);
1300    * xmm3 = _mm_mul_ps(y, xmm3);
1301    */
1302   LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1");
1303   LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2");
1304   LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3");
1305
1306   /*
1307    * x = _mm_add_ps(x, xmm1);
1308    * x = _mm_add_ps(x, xmm2);
1309    * x = _mm_add_ps(x, xmm3);
1310    */
1311
1312   LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1");
1313   LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2");
1314   LLVMValueRef x_3 = LLVMBuildAdd(b, x_2, xmm3, "x_3");
1315
1316   /*
1317    * Evaluate the first polynom  (0 <= x <= Pi/4)
1318    *
1319    * z = _mm_mul_ps(x,x);
1320    */
1321   LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z");
1322
1323   /*
1324    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
1325    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1326    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
1327    */
1328   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1329   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1330   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1331
1332   /*
1333    * y = *(v4sf*)_ps_coscof_p0;
1334    * y = _mm_mul_ps(y, z);
1335    */
1336   LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3");
1337   LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4");
1338   LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5");
1339   LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6");
1340   LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7");
1341   LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8");
1342
1343
1344   /*
1345    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1346    * y = _mm_sub_ps(y, tmp);
1347    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1348    */
1349   LLVMValueRef half = lp_build_const_v4sf(0.5);
1350   LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp");
1351   LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8");
1352   LLVMValueRef one = lp_build_const_v4sf(1.0);
1353   LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9");
1354
1355   /*
1356    * _PS_CONST(sincof_p0, -1.9515295891E-4);
1357    * _PS_CONST(sincof_p1,  8.3321608736E-3);
1358    * _PS_CONST(sincof_p2, -1.6666654611E-1);
1359    */
1360   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1361   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1362   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1363
1364   /*
1365    * Evaluate the second polynom  (Pi/4 <= x <= 0)
1366    *
1367    * y2 = *(v4sf*)_ps_sincof_p0;
1368    * y2 = _mm_mul_ps(y2, z);
1369    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1370    * y2 = _mm_mul_ps(y2, z);
1371    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1372    * y2 = _mm_mul_ps(y2, z);
1373    * y2 = _mm_mul_ps(y2, x);
1374    * y2 = _mm_add_ps(y2, x);
1375    */
1376
1377   LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3");
1378   LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4");
1379   LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5");
1380   LLVMValueRef y2_6 = LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6");
1381   LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7");
1382   LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8");
1383   LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9");
1384
1385   /*
1386    * select the correct result from the two polynoms
1387    * xmm3 = poly_mask;
1388    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1389    * y = _mm_andnot_ps(xmm3, y);
1390    * y = _mm_add_ps(y,y2);
1391    */
1392   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1393   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1394   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1395   LLVMValueRef inv = lp_build_const_v4si(~0);
1396   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1397   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1398   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1399
1400   /*
1401    * update the sign
1402    * y = _mm_xor_ps(y, sign_bit);
1403    */
1404   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1405   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1406   return y_result;
1407}
1408
1409
1410/**
1411 * Generate cos(a) using SSE2
1412 */
1413LLVMValueRef
1414lp_build_cos(struct lp_build_context *bld,
1415             LLVMValueRef a)
1416{
1417   struct lp_type int_type = lp_int_type(bld->type);
1418   LLVMBuilderRef b = bld->builder;
1419   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1420   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1421
1422   /*
1423    *  take the absolute value,
1424    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1425    */
1426
1427   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1428   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1429
1430   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1431   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1432
1433   /*
1434    * scale by 4/Pi
1435    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1436    */
1437
1438   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1439   LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y");
1440
1441   /*
1442    * store the integer part of y in mm0
1443    * emm2 = _mm_cvttps_epi32(y);
1444    */
1445
1446   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1447
1448   /*
1449    * j=(j+1) & (~1) (see the cephes sources)
1450    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1451    */
1452
1453   LLVMValueRef all_one = lp_build_const_v4si(1);
1454   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1455   /*
1456    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1457    */
1458   LLVMValueRef inv_one = lp_build_const_v4si(~1);
1459   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1460
1461   /*
1462    * y = _mm_cvtepi32_ps(emm2);
1463    */
1464   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1465
1466
1467   /*
1468    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1469    */
1470   LLVMValueRef const_2 = lp_build_const_v4si(2);
1471   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1472
1473
1474   /* get the swap sign flag
1475    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1476    */
1477   LLVMValueRef inv = lp_build_const_v4si(~0);
1478   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1479   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1480   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1481
1482   /*
1483    * emm2 = _mm_slli_epi32(emm0, 29);
1484    */
1485   LLVMValueRef const_29 = lp_build_const_v4si(29);
1486   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1487
1488   /*
1489    * get the polynom selection mask
1490    * there is one polynom for 0 <= x <= Pi/4
1491    * and another one for Pi/4<x<=Pi/2
1492    * Both branches will be computed.
1493    *
1494    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1495    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1496    */
1497
1498   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1499   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1500   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1501   				             emm2_3, lp_build_const_v4si(0));
1502
1503   /*
1504    * _PS_CONST(minus_cephes_DP1, -0.78515625);
1505    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1506    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1507    */
1508   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1509   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1510   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1511
1512   /*
1513    * The magic pass: "Extended precision modular arithmetic"
1514    * x = ((x - y * DP1) - y * DP2) - y * DP3;
1515    * xmm1 = _mm_mul_ps(y, xmm1);
1516    * xmm2 = _mm_mul_ps(y, xmm2);
1517    * xmm3 = _mm_mul_ps(y, xmm3);
1518    */
1519   LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1");
1520   LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2");
1521   LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3");
1522
1523   /*
1524    * x = _mm_add_ps(x, xmm1);
1525    * x = _mm_add_ps(x, xmm2);
1526    * x = _mm_add_ps(x, xmm3);
1527    */
1528
1529   LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1");
1530   LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2");
1531   LLVMValueRef x_3 = LLVMBuildAdd(b, x_2, xmm3, "x_3");
1532
1533   /*
1534    * Evaluate the first polynom  (0 <= x <= Pi/4)
1535    *
1536    * z = _mm_mul_ps(x,x);
1537    */
1538   LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z");
1539
1540   /*
1541    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
1542    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1543    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
1544    */
1545   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1546   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1547   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1548
1549   /*
1550    * y = *(v4sf*)_ps_coscof_p0;
1551    * y = _mm_mul_ps(y, z);
1552    */
1553   LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3");
1554   LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4");
1555   LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5");
1556   LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6");
1557   LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7");
1558   LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8");
1559
1560
1561   /*
1562    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1563    * y = _mm_sub_ps(y, tmp);
1564    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1565    */
1566   LLVMValueRef half = lp_build_const_v4sf(0.5);
1567   LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp");
1568   LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8");
1569   LLVMValueRef one = lp_build_const_v4sf(1.0);
1570   LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9");
1571
1572   /*
1573    * _PS_CONST(sincof_p0, -1.9515295891E-4);
1574    * _PS_CONST(sincof_p1,  8.3321608736E-3);
1575    * _PS_CONST(sincof_p2, -1.6666654611E-1);
1576    */
1577   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1578   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1579   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1580
1581   /*
1582    * Evaluate the second polynom  (Pi/4 <= x <= 0)
1583    *
1584    * y2 = *(v4sf*)_ps_sincof_p0;
1585    * y2 = _mm_mul_ps(y2, z);
1586    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1587    * y2 = _mm_mul_ps(y2, z);
1588    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1589    * y2 = _mm_mul_ps(y2, z);
1590    * y2 = _mm_mul_ps(y2, x);
1591    * y2 = _mm_add_ps(y2, x);
1592    */
1593
1594   LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3");
1595   LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4");
1596   LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5");
1597   LLVMValueRef y2_6 = LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6");
1598   LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7");
1599   LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8");
1600   LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9");
1601
1602   /*
1603    * select the correct result from the two polynoms
1604    * xmm3 = poly_mask;
1605    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1606    * y = _mm_andnot_ps(xmm3, y);
1607    * y = _mm_add_ps(y,y2);
1608    */
1609   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1610   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1611   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1612   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1613   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1614   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1615
1616   /*
1617    * update the sign
1618    * y = _mm_xor_ps(y, sign_bit);
1619    */
1620   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1621   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1622   return y_result;
1623}
1624
1625
1626/**
1627 * Generate pow(x, y)
1628 */
1629LLVMValueRef
1630lp_build_pow(struct lp_build_context *bld,
1631             LLVMValueRef x,
1632             LLVMValueRef y)
1633{
1634   /* TODO: optimize the constant case */
1635   if(LLVMIsConstant(x) && LLVMIsConstant(y))
1636      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1637                   __FUNCTION__);
1638
1639   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1640}
1641
1642
1643/**
1644 * Generate exp(x)
1645 */
1646LLVMValueRef
1647lp_build_exp(struct lp_build_context *bld,
1648             LLVMValueRef x)
1649{
1650   /* log2(e) = 1/log(2) */
1651   LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1652
1653   return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
1654}
1655
1656
1657/**
1658 * Generate log(x)
1659 */
1660LLVMValueRef
1661lp_build_log(struct lp_build_context *bld,
1662             LLVMValueRef x)
1663{
1664   /* log(2) */
1665   LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1666
1667   return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
1668}
1669
1670
1671#define EXP_POLY_DEGREE 3
1672#define LOG_POLY_DEGREE 5
1673
1674
1675/**
1676 * Generate polynomial.
1677 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1678 */
1679static LLVMValueRef
1680lp_build_polynomial(struct lp_build_context *bld,
1681                    LLVMValueRef x,
1682                    const double *coeffs,
1683                    unsigned num_coeffs)
1684{
1685   const struct lp_type type = bld->type;
1686   LLVMValueRef res = NULL;
1687   unsigned i;
1688
1689   /* TODO: optimize the constant case */
1690   if(LLVMIsConstant(x))
1691      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1692                   __FUNCTION__);
1693
1694   for (i = num_coeffs; i--; ) {
1695      LLVMValueRef coeff;
1696
1697      coeff = lp_build_const_vec(type, coeffs[i]);
1698
1699      if(res)
1700         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1701      else
1702         res = coeff;
1703   }
1704
1705   if(res)
1706      return res;
1707   else
1708      return bld->undef;
1709}
1710
1711
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 *
 * Coefficients are listed lowest power first, which is the order
 * lp_build_polynomial() expects.  The row is selected at compile time by
 * EXP_POLY_DEGREE; any value other than 2..5 is a build error.
 *
 * NOTE(review): the degree 3 and 2 rows sum to about 2.0 (the value of 2**x
 * at x = 1), but the degree 5 and 4 rows sum to about 1.78 and 1.74.  They
 * look like fits of something else -- verify before selecting them.
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999999690134838155,
   0.583974334321735217258,
   0.164553105719676828492,
   0.0292811063701710962255,
   0.00354944426657875141846,
   0.000296253726543423377365
#elif EXP_POLY_DEGREE == 4
   1.00000001502262084505,
   0.563586057338685991394,
   0.150436017652442413623,
   0.0243220604213317927308,
   0.0025359088446580436489
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
1742
1743
1744void
1745lp_build_exp2_approx(struct lp_build_context *bld,
1746                     LLVMValueRef x,
1747                     LLVMValueRef *p_exp2_int_part,
1748                     LLVMValueRef *p_frac_part,
1749                     LLVMValueRef *p_exp2)
1750{
1751   const struct lp_type type = bld->type;
1752   LLVMTypeRef vec_type = lp_build_vec_type(type);
1753   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1754   LLVMValueRef ipart = NULL;
1755   LLVMValueRef fpart = NULL;
1756   LLVMValueRef expipart = NULL;
1757   LLVMValueRef expfpart = NULL;
1758   LLVMValueRef res = NULL;
1759
1760   if(p_exp2_int_part || p_frac_part || p_exp2) {
1761      /* TODO: optimize the constant case */
1762      if(LLVMIsConstant(x))
1763         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1764                      __FUNCTION__);
1765
1766      assert(type.floating && type.width == 32);
1767
1768      x = lp_build_min(bld, x, lp_build_const_vec(type,  129.0));
1769      x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1770
1771      /* ipart = floor(x) */
1772      ipart = lp_build_floor(bld, x);
1773
1774      /* fpart = x - ipart */
1775      fpart = LLVMBuildSub(bld->builder, x, ipart, "");
1776   }
1777
1778   if(p_exp2_int_part || p_exp2) {
1779      /* expipart = (float) (1 << ipart) */
1780      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1781      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
1782      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
1783      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1784   }
1785
1786   if(p_exp2) {
1787      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1788                                     Elements(lp_build_exp2_polynomial));
1789
1790      res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
1791   }
1792
1793   if(p_exp2_int_part)
1794      *p_exp2_int_part = expipart;
1795
1796   if(p_frac_part)
1797      *p_frac_part = fpart;
1798
1799   if(p_exp2)
1800      *p_exp2 = res;
1801}
1802
1803
1804LLVMValueRef
1805lp_build_exp2(struct lp_build_context *bld,
1806              LLVMValueRef x)
1807{
1808   LLVMValueRef res;
1809   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1810   return res;
1811}
1812
1813
/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 *
 * Coefficients are listed lowest power first, which is the order
 * lp_build_polynomial() expects.  The row is selected at compile time by
 * LOG_POLY_DEGREE; any value other than 3..6 is a build error.  Each row
 * holds LOG_POLY_DEGREE coefficients: lp_build_log2_approx() multiplies the
 * polynomial by (x - 1) afterwards, which raises the degree by one.
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614,
   -3.32419399085241980044,
   2.59883907202499966007,
   -1.23152682416275988241,
   0.318212422185251071475,
   -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201,
   -2.52074962577807006663,
   1.48116647521213171641,
   -0.465725644288844778798,
   0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148,
   -1.75647175389045657003,
   0.688243882994381274313,
   -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682,
   -1.04913055217340124191,
   0.204446009836232697516
#else
#error
#endif
};
1846
1847
1848/**
1849 * See http://www.devmaster.net/forums/showthread.php?p=43580
1850 */
1851void
1852lp_build_log2_approx(struct lp_build_context *bld,
1853                     LLVMValueRef x,
1854                     LLVMValueRef *p_exp,
1855                     LLVMValueRef *p_floor_log2,
1856                     LLVMValueRef *p_log2)
1857{
1858   const struct lp_type type = bld->type;
1859   LLVMTypeRef vec_type = lp_build_vec_type(type);
1860   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1861
1862   LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
1863   LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
1864   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1865
1866   LLVMValueRef i = NULL;
1867   LLVMValueRef exp = NULL;
1868   LLVMValueRef mant = NULL;
1869   LLVMValueRef logexp = NULL;
1870   LLVMValueRef logmant = NULL;
1871   LLVMValueRef res = NULL;
1872
1873   if(p_exp || p_floor_log2 || p_log2) {
1874      /* TODO: optimize the constant case */
1875      if(LLVMIsConstant(x))
1876         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1877                      __FUNCTION__);
1878
1879      assert(type.floating && type.width == 32);
1880
1881      i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1882
1883      /* exp = (float) exponent(x) */
1884      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1885   }
1886
1887   if(p_floor_log2 || p_log2) {
1888      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
1889      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
1890      logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1891   }
1892
1893   if(p_log2) {
1894      /* mant = (float) mantissa(x) */
1895      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1896      mant = LLVMBuildOr(bld->builder, mant, one, "");
1897      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
1898
1899      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1900                                    Elements(lp_build_log2_polynomial));
1901
1902      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1903      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
1904
1905      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1906   }
1907
1908   if(p_exp) {
1909      exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
1910      *p_exp = exp;
1911   }
1912
1913   if(p_floor_log2)
1914      *p_floor_log2 = logexp;
1915
1916   if(p_log2)
1917      *p_log2 = res;
1918}
1919
1920
1921LLVMValueRef
1922lp_build_log2(struct lp_build_context *bld,
1923              LLVMValueRef x)
1924{
1925   LLVMValueRef res;
1926   lp_build_log2_approx(bld, x, NULL, NULL, &res);
1927   return res;
1928}
1929