lp_bld_arit.c revision da5e9fce47b2029c6f6445ed53f3b5e5ff3889a0
1/**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29/**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 *   of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48#include "util/u_memory.h"
49#include "util/u_debug.h"
50#include "util/u_math.h"
51#include "util/u_string.h"
52#include "util/u_cpu_detect.h"
53
54#include "lp_bld_type.h"
55#include "lp_bld_const.h"
56#include "lp_bld_init.h"
57#include "lp_bld_intr.h"
58#include "lp_bld_logic.h"
59#include "lp_bld_pack.h"
60#include "lp_bld_debug.h"
61#include "lp_bld_arit.h"
62
63
64#define EXP_POLY_DEGREE 5
65
66#define LOG_POLY_DEGREE 5
67
68
69/**
70 * Generate min(a, b)
71 * No checks for special case values of a or b = 1 or 0 are done.
72 */
73static LLVMValueRef
74lp_build_min_simple(struct lp_build_context *bld,
75                    LLVMValueRef a,
76                    LLVMValueRef b)
77{
78   LLVMBuilderRef builder = bld->gallivm->builder;
79   const struct lp_type type = bld->type;
80   const char *intrinsic = NULL;
81   LLVMValueRef cond;
82
83   assert(lp_check_value(type, a));
84   assert(lp_check_value(type, b));
85
86   /* TODO: optimize the constant case */
87
88   if(type.width * type.length == 128) {
89      if(type.floating) {
90         if(type.width == 32 && util_cpu_caps.has_sse)
91            intrinsic = "llvm.x86.sse.min.ps";
92         if(type.width == 64 && util_cpu_caps.has_sse2)
93            intrinsic = "llvm.x86.sse2.min.pd";
94      }
95      else {
96         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
97            intrinsic = "llvm.x86.sse2.pminu.b";
98         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
99            intrinsic = "llvm.x86.sse41.pminsb";
100         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
101            intrinsic = "llvm.x86.sse41.pminuw";
102         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
103            intrinsic = "llvm.x86.sse2.pmins.w";
104         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
105            intrinsic = "llvm.x86.sse41.pminud";
106         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
107            intrinsic = "llvm.x86.sse41.pminsd";
108      }
109   }
110
111   if(intrinsic)
112      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
113
114   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
115   return lp_build_select(bld, cond, a, b);
116}
117
118
119/**
120 * Generate max(a, b)
121 * No checks for special case values of a or b = 1 or 0 are done.
122 */
123static LLVMValueRef
124lp_build_max_simple(struct lp_build_context *bld,
125                    LLVMValueRef a,
126                    LLVMValueRef b)
127{
128   LLVMBuilderRef builder = bld->gallivm->builder;
129   const struct lp_type type = bld->type;
130   const char *intrinsic = NULL;
131   LLVMValueRef cond;
132
133   assert(lp_check_value(type, a));
134   assert(lp_check_value(type, b));
135
136   /* TODO: optimize the constant case */
137
138   if(type.width * type.length == 128) {
139      if(type.floating) {
140         if(type.width == 32 && util_cpu_caps.has_sse)
141            intrinsic = "llvm.x86.sse.max.ps";
142         if(type.width == 64 && util_cpu_caps.has_sse2)
143            intrinsic = "llvm.x86.sse2.max.pd";
144      }
145      else {
146         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
147            intrinsic = "llvm.x86.sse2.pmaxu.b";
148         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
149            intrinsic = "llvm.x86.sse41.pmaxsb";
150         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
151            intrinsic = "llvm.x86.sse41.pmaxuw";
152         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
153            intrinsic = "llvm.x86.sse2.pmaxs.w";
154         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
155            intrinsic = "llvm.x86.sse41.pmaxud";
156         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
157            intrinsic = "llvm.x86.sse41.pmaxsd";
158      }
159   }
160
161   if(intrinsic)
162      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
163
164   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
165   return lp_build_select(bld, cond, a, b);
166}
167
168
169/**
170 * Generate 1 - a, or ~a depending on bld->type.
171 */
172LLVMValueRef
173lp_build_comp(struct lp_build_context *bld,
174              LLVMValueRef a)
175{
176   LLVMBuilderRef builder = bld->gallivm->builder;
177   const struct lp_type type = bld->type;
178
179   assert(lp_check_value(type, a));
180
181   if(a == bld->one)
182      return bld->zero;
183   if(a == bld->zero)
184      return bld->one;
185
186   if(type.norm && !type.floating && !type.fixed && !type.sign) {
187      if(LLVMIsConstant(a))
188         return LLVMConstNot(a);
189      else
190         return LLVMBuildNot(builder, a, "");
191   }
192
193   if(LLVMIsConstant(a))
194      if (type.floating)
195          return LLVMConstFSub(bld->one, a);
196      else
197          return LLVMConstSub(bld->one, a);
198   else
199      if (type.floating)
200         return LLVMBuildFSub(builder, bld->one, a, "");
201      else
202         return LLVMBuildSub(builder, bld->one, a, "");
203}
204
205
/**
 * Generate a + b
 *
 * For normalized types the addition saturates: 8/16-bit integer norm types
 * use the SSE2 saturated-add intrinsics when available, and float/fixed
 * norm types are clamped against 1.0 after the add.
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* x + 0 = x */
   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   /* Undefined operands poison the result. */
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* Saturated: 1 + x = 1 (operands assumed in [0, 1]). */
      if(a == bld->one || b == bld->one)
        return bld->one;

      /* Use SSE2 saturated adds for 128-bit 8/16-bit integer vectors. */
      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* Constant fold when both operands are constants, else emit the add. */
   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
266
267
268/** Return the scalar sum of the elements of a */
269LLVMValueRef
270lp_build_sum_vector(struct lp_build_context *bld,
271                    LLVMValueRef a)
272{
273   LLVMBuilderRef builder = bld->gallivm->builder;
274   const struct lp_type type = bld->type;
275   LLVMValueRef index, res;
276   unsigned i;
277
278   assert(lp_check_value(type, a));
279
280   if (type.length == 1) {
281      return a;
282   }
283
284   assert(!bld->type.norm);
285
286   index = lp_build_const_int32(bld->gallivm, 0);
287   res = LLVMBuildExtractElement(builder, a, index, "");
288
289   for (i = 1; i < type.length; i++) {
290      index = lp_build_const_int32(bld->gallivm, i);
291      if (type.floating)
292         res = LLVMBuildFAdd(builder, res,
293                            LLVMBuildExtractElement(builder,
294                                                    a, index, ""),
295                            "");
296      else
297         res = LLVMBuildAdd(builder, res,
298                            LLVMBuildExtractElement(builder,
299                                                    a, index, ""),
300                            "");
301   }
302
303   return res;
304}
305
306
/**
 * Generate a - b
 *
 * For normalized types the subtraction saturates: 8/16-bit integer norm
 * types use the SSE2 saturated-subtract intrinsics when available, and
 * float/fixed norm types are clamped against 0 after the subtract.
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* x - 0 = x */
   if(b == bld->zero)
      return a;
   /* Undefined operands poison the result. */
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   /* x - x = 0 */
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* Saturated: x - 1 = 0 (operands assumed in [0, 1]). */
      if(b == bld->one)
        return bld->zero;

      /* Use SSE2 saturated subtracts for 128-bit 8/16-bit integer vectors. */
      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* Constant fold when both operands are constants, else emit the sub. */
   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   /* Clamp to floor of 0 for norm float/fixed types. */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}
364
365
366/**
367 * Normalized 8bit multiplication.
368 *
369 * - alpha plus one
370 *
371 *     makes the following approximation to the division (Sree)
372 *
373 *       a*b/255 ~= (a*(b + 1)) >> 256
374 *
375 *     which is the fastest method that satisfies the following OpenGL criteria
376 *
377 *       0*0 = 0 and 255*255 = 255
378 *
379 * - geometric series
380 *
381 *     takes the geometric series approximation to the division
382 *
383 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
384 *
385 *     in this case just the first two terms to fit in 16bit arithmetic
386 *
387 *       t/255 ~= (t + (t >> 8)) >> 8
388 *
389 *     note that just by itself it doesn't satisfies the OpenGL criteria, as
390 *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
391 *     must be used
392 *
393 * - geometric series plus rounding
394 *
395 *     when using a geometric series division instead of truncating the result
396 *     use roundoff in the approximation (Jim Blinn)
397 *
398 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
399 *
400 *     achieving the exact results
401 *
402 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
403 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
404 * @sa Michael Herf, The "double blend trick", May 2000,
405 *     http://www.stereopsis.com/doubleblend.html
406 */
407static LLVMValueRef
408lp_build_mul_u8n(struct gallivm_state *gallivm,
409                 struct lp_type i16_type,
410                 LLVMValueRef a, LLVMValueRef b)
411{
412   LLVMBuilderRef builder = gallivm->builder;
413   LLVMValueRef c8;
414   LLVMValueRef ab;
415
416   assert(!i16_type.floating);
417   assert(lp_check_value(i16_type, a));
418   assert(lp_check_value(i16_type, b));
419
420   c8 = lp_build_const_int_vec(gallivm, i16_type, 8);
421
422#if 0
423
424   /* a*b/255 ~= (a*(b + 1)) >> 256 */
425   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallium, i16_type, 1), "");
426   ab = LLVMBuildMul(builder, a, b, "");
427
428#else
429
430   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
431   ab = LLVMBuildMul(builder, a, b, "");
432   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
433   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");
434
435#endif
436
437   ab = LLVMBuildLShr(builder, ab, c8, "");
438
439   return ab;
440}
441
442
/**
 * Generate a * b
 *
 * Normalized 8-bit integer types take a special path via lp_build_mul_u8n;
 * fixed-point products are shifted back down by half the bit width.
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* Algebraic identities and undef propagation. */
   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         /* Normalized 8-bit multiply: unpack both operands to 16 bits,
          * multiply each half with rounding, and pack back to 8 bits.
          */
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);

         ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME: norm integer multiply only implemented for 8-bit widths */
      assert(0);
   }

   /* Fixed-point: shift the product back into range afterwards. */
   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      /* Constant fold the multiply (and the fixed-point shift). */
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         /* Arithmetic shift for signed, logical for unsigned. */
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}
523
524
/**
 * Small vector x scale multiplication optimization.
 *
 * Multiplies by an integer immediate, using cheaper operations
 * (negation, addition, shifts) for special values of b where possible.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   /* x * 0 = 0 */
   if(b == 0)
      return bld->zero;

   /* x * 1 = x */
   if(b == 1)
      return a;

   /* x * -1 = -x */
   if(b == -1)
      return lp_build_negate(bld, a);

   /* x * 2 = x + x for floats */
   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the mantissa.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         /* Integer power of two: emit a left shift instead of a multiply. */
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   /* General case: full multiply by a splatted constant. */
   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
579
580
581/**
582 * Generate a / b
583 */
584LLVMValueRef
585lp_build_div(struct lp_build_context *bld,
586             LLVMValueRef a,
587             LLVMValueRef b)
588{
589   LLVMBuilderRef builder = bld->gallivm->builder;
590   const struct lp_type type = bld->type;
591
592   assert(lp_check_value(type, a));
593   assert(lp_check_value(type, b));
594
595   if(a == bld->zero)
596      return bld->zero;
597   if(a == bld->one)
598      return lp_build_rcp(bld, b);
599   if(b == bld->zero)
600      return bld->undef;
601   if(b == bld->one)
602      return a;
603   if(a == bld->undef || b == bld->undef)
604      return bld->undef;
605
606   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
607      if (type.floating)
608         return LLVMConstFDiv(a, b);
609      else if (type.sign)
610         return LLVMConstSDiv(a, b);
611      else
612         return LLVMConstUDiv(a, b);
613   }
614
615   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 &&
616      type.floating)
617      return lp_build_mul(bld, a, lp_build_rcp(bld, b));
618
619   if (type.floating)
620      return LLVMBuildFDiv(builder, a, b, "");
621   else if (type.sign)
622      return LLVMBuildSDiv(builder, a, b, "");
623   else
624      return LLVMBuildUDiv(builder, a, b, "");
625}
626
627
/**
 * Linear interpolation -- without any checks.
 *
 * Computes v0 + x*(v1 - v0).
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   /* delta = v1 - v0 */
   delta = lp_build_sub(bld, v1, v0);

   /* res = x * delta */
   res = lp_build_mul(bld, x, delta);

   /* res = v0 + x * delta */
   res = lp_build_add(bld, v0, res);

   if (bld->type.fixed) {
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for other uses. Basically we need a more
       * powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
   }

   return res;
}
663
664
/**
 * Linear interpolation.
 *
 * For (unsigned) normalized types the operands are unpacked into a wider
 * fixed-point type first, so the intermediate product does not overflow,
 * and x is rescaled so the top of the range maps exactly to 1.0.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
      LLVMValueRef shift;

      assert(type.length >= 2);
      assert(!type.sign);

      /*
       * Create a wider type, enough to hold the intermediate result of the
       * multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.fixed  = TRUE;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      /* Unpack each operand into low and high halves of the wide type. */
      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Scale x from [0, 255] to [0, 256]
       */

      shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);

      xl = lp_build_add(&wide_bld, xl,
                        LLVMBuildAShr(builder, xl, shift, ""));
      xh = lp_build_add(&wide_bld, xh,
                        LLVMBuildAShr(builder, xh, shift, ""));

      /*
       * Lerp both halves.
       */

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);

      /* Pack the two wide halves back into the original narrow type. */
      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1);
   }

   return res;
}
731
732
733LLVMValueRef
734lp_build_lerp_2d(struct lp_build_context *bld,
735                 LLVMValueRef x,
736                 LLVMValueRef y,
737                 LLVMValueRef v00,
738                 LLVMValueRef v01,
739                 LLVMValueRef v10,
740                 LLVMValueRef v11)
741{
742   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
743   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
744   return lp_build_lerp(bld, y, v0, v1);
745}
746
747
748/**
749 * Generate min(a, b)
750 * Do checks for special cases.
751 */
752LLVMValueRef
753lp_build_min(struct lp_build_context *bld,
754             LLVMValueRef a,
755             LLVMValueRef b)
756{
757   assert(lp_check_value(bld->type, a));
758   assert(lp_check_value(bld->type, b));
759
760   if(a == bld->undef || b == bld->undef)
761      return bld->undef;
762
763   if(a == b)
764      return a;
765
766   if(bld->type.norm) {
767      if(a == bld->zero || b == bld->zero)
768         return bld->zero;
769      if(a == bld->one)
770         return b;
771      if(b == bld->one)
772         return a;
773   }
774
775   return lp_build_min_simple(bld, a, b);
776}
777
778
779/**
780 * Generate max(a, b)
781 * Do checks for special cases.
782 */
783LLVMValueRef
784lp_build_max(struct lp_build_context *bld,
785             LLVMValueRef a,
786             LLVMValueRef b)
787{
788   assert(lp_check_value(bld->type, a));
789   assert(lp_check_value(bld->type, b));
790
791   if(a == bld->undef || b == bld->undef)
792      return bld->undef;
793
794   if(a == b)
795      return a;
796
797   if(bld->type.norm) {
798      if(a == bld->one || b == bld->one)
799         return bld->one;
800      if(a == bld->zero)
801         return b;
802      if(b == bld->zero)
803         return a;
804   }
805
806   return lp_build_max_simple(bld, a, b);
807}
808
809
810/**
811 * Generate clamp(a, min, max)
812 * Do checks for special cases.
813 */
814LLVMValueRef
815lp_build_clamp(struct lp_build_context *bld,
816               LLVMValueRef a,
817               LLVMValueRef min,
818               LLVMValueRef max)
819{
820   assert(lp_check_value(bld->type, a));
821   assert(lp_check_value(bld->type, min));
822   assert(lp_check_value(bld->type, max));
823
824   a = lp_build_min(bld, a, max);
825   a = lp_build_max(bld, a, min);
826   return a;
827}
828
829
/**
 * Generate abs(a)
 *
 * Floats clear the sign bit directly; integers use the SSSE3 PABS
 * instructions when available, falling back to max(a, -a).
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   /* Unsigned values are their own absolute value. */
   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   /* Signed integers: PABSB/PABSW/PABSD for 128-bit vectors on SSSE3. */
   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }

   /* Generic fallback: abs(a) = max(a, -a). */
   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
870
871
/**
 * Generate -a.
 *
 * Uses LLVMBuildFNeg for floating types where available (the C binding
 * is gated on LLVM >= 2.7 here); otherwise falls back to integer Neg.
 */
LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}
889
890
/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      /* Bit mask selecting the IEEE sign bit (the top bit). */
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      /* (ORing the extracted sign bit onto 1.0 yields +/-1.0) */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* Signed integers: compare against zero and select +1 or -1. */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
940
941
942/**
943 * Set the sign of float vector 'a' according to 'sign'.
944 * If sign==0, return abs(a).
945 * If sign==1, return -abs(a);
946 * Other values for sign produce undefined results.
947 */
948LLVMValueRef
949lp_build_set_sign(struct lp_build_context *bld,
950                  LLVMValueRef a, LLVMValueRef sign)
951{
952   LLVMBuilderRef builder = bld->gallivm->builder;
953   const struct lp_type type = bld->type;
954   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
955   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
956   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
957   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
958                             ~((unsigned long long) 1 << (type.width - 1)));
959   LLVMValueRef val, res;
960
961   assert(type.floating);
962   assert(lp_check_value(type, a));
963
964   /* val = reinterpret_cast<int>(a) */
965   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
966   /* val = val & mask */
967   val = LLVMBuildAnd(builder, val, mask, "");
968   /* sign = sign << shift */
969   sign = LLVMBuildShl(builder, sign, shift, "");
970   /* res = val | sign */
971   res = LLVMBuildOr(builder, val, sign, "");
972   /* res = reinterpret_cast<float>(res) */
973   res = LLVMBuildBitCast(builder, res, vec_type, "");
974
975   return res;
976}
977
978
979/**
980 * Convert vector of (or scalar) int to vector of (or scalar) float.
981 */
982LLVMValueRef
983lp_build_int_to_float(struct lp_build_context *bld,
984                      LLVMValueRef a)
985{
986   LLVMBuilderRef builder = bld->gallivm->builder;
987   const struct lp_type type = bld->type;
988   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
989
990   assert(type.floating);
991
992   return LLVMBuildSIToFP(builder, a, vec_type, "");
993}
994
995
996
/**
 * Rounding-mode immediates for the SSE4.1 ROUNDxx instructions, passed
 * as the mode operand of the llvm.x86.sse41.round.* intrinsics.
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,   /**< round to nearest (ties to even) */
   LP_BUILD_ROUND_SSE41_FLOOR = 1,     /**< round towards -infinity */
   LP_BUILD_ROUND_SSE41_CEIL = 2,      /**< round towards +infinity */
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3   /**< round towards zero */
};
1004
1005
/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      /* Scalar: the round.ss/round.sd intrinsics operate on 4-wide
       * vectors, so insert the scalar into an undef vector, round,
       * and extract element 0 again.
       */
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      /* args: (passthrough, source, rounding mode immediate) */
      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      /* Vector: round.ps/round.pd handle a full 128-bit vector. */
      assert(type.width*type.length == 128);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ps";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.pd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}
1081
1082
/**
 * Round float (vector) to nearest integer (vector) using SSE2's
 * CVTSS2SI/CVTPS2DQ instructions.
 *
 * Only 32-bit floats are handled, either scalar or in a 128-bit (4-wide)
 * vector.  The result is an integer (vector) of the same length.
 */
static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      /* Scalar: cvtss2si reads lane 0 of a 4-wide vector, so insert the
       * value into an undef vector first.
       */
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      /* Packed: one full 128-bit register. */
      assert(type.width*type.length == 128);

      intrinsic = "llvm.x86.sse2.cvtps2dq";

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}
1130
1131
1132/**
1133 * Return the integer part of a float (vector) value (== round toward zero).
1134 * The returned value is a float (vector).
1135 * Ex: trunc(-1.5) = -1.0
1136 */
1137LLVMValueRef
1138lp_build_trunc(struct lp_build_context *bld,
1139               LLVMValueRef a)
1140{
1141   LLVMBuilderRef builder = bld->gallivm->builder;
1142   const struct lp_type type = bld->type;
1143
1144   assert(type.floating);
1145   assert(lp_check_value(type, a));
1146
1147   if (util_cpu_caps.has_sse4_1 &&
1148       (type.length == 1 || type.width*type.length == 128)) {
1149      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
1150   }
1151   else {
1152      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1153      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1154      LLVMValueRef res;
1155      res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1156      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1157      return res;
1158   }
1159}
1160
1161
1162/**
1163 * Return float (vector) rounded to nearest integer (vector).  The returned
1164 * value is a float (vector).
1165 * Ex: round(0.9) = 1.0
1166 * Ex: round(-1.5) = -2.0
1167 */
1168LLVMValueRef
1169lp_build_round(struct lp_build_context *bld,
1170               LLVMValueRef a)
1171{
1172   LLVMBuilderRef builder = bld->gallivm->builder;
1173   const struct lp_type type = bld->type;
1174
1175   assert(type.floating);
1176   assert(lp_check_value(type, a));
1177
1178   if (util_cpu_caps.has_sse4_1 &&
1179       (type.length == 1 || type.width*type.length == 128)) {
1180      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1181   }
1182   else {
1183      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1184      LLVMValueRef res;
1185      res = lp_build_iround(bld, a);
1186      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1187      return res;
1188   }
1189}
1190
1191
1192/**
1193 * Return floor of float (vector), result is a float (vector)
1194 * Ex: floor(1.1) = 1.0
1195 * Ex: floor(-1.1) = -2.0
1196 */
1197LLVMValueRef
1198lp_build_floor(struct lp_build_context *bld,
1199               LLVMValueRef a)
1200{
1201   LLVMBuilderRef builder = bld->gallivm->builder;
1202   const struct lp_type type = bld->type;
1203
1204   assert(type.floating);
1205   assert(lp_check_value(type, a));
1206
1207   if (util_cpu_caps.has_sse4_1 &&
1208       (type.length == 1 || type.width*type.length == 128)) {
1209      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1210   }
1211   else {
1212      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1213      LLVMValueRef res;
1214      res = lp_build_ifloor(bld, a);
1215      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1216      return res;
1217   }
1218}
1219
1220
1221/**
1222 * Return ceiling of float (vector), returning float (vector).
1223 * Ex: ceil( 1.1) = 2.0
1224 * Ex: ceil(-1.1) = -1.0
1225 */
1226LLVMValueRef
1227lp_build_ceil(struct lp_build_context *bld,
1228              LLVMValueRef a)
1229{
1230   LLVMBuilderRef builder = bld->gallivm->builder;
1231   const struct lp_type type = bld->type;
1232
1233   assert(type.floating);
1234   assert(lp_check_value(type, a));
1235
1236   if (util_cpu_caps.has_sse4_1 &&
1237       (type.length == 1 || type.width*type.length == 128)) {
1238      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1239   }
1240   else {
1241      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1242      LLVMValueRef res;
1243      res = lp_build_iceil(bld, a);
1244      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1245      return res;
1246   }
1247}
1248
1249
1250/**
1251 * Return fractional part of 'a' computed as a - floor(a)
1252 * Typically used in texture coord arithmetic.
1253 */
1254LLVMValueRef
1255lp_build_fract(struct lp_build_context *bld,
1256               LLVMValueRef a)
1257{
1258   assert(bld->type.floating);
1259   return lp_build_sub(bld, a, lp_build_floor(bld, a));
1260}
1261
1262
1263/**
1264 * Return the integer part of a float (vector) value (== round toward zero).
1265 * The returned value is an integer (vector).
1266 * Ex: itrunc(-1.5) = -1
1267 */
1268LLVMValueRef
1269lp_build_itrunc(struct lp_build_context *bld,
1270                LLVMValueRef a)
1271{
1272   LLVMBuilderRef builder = bld->gallivm->builder;
1273   const struct lp_type type = bld->type;
1274   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1275
1276   assert(type.floating);
1277   assert(lp_check_value(type, a));
1278
1279   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1280}
1281
1282
1283/**
1284 * Return float (vector) rounded to nearest integer (vector).  The returned
1285 * value is an integer (vector).
1286 * Ex: iround(0.9) = 1
1287 * Ex: iround(-1.5) = -2
1288 */
1289LLVMValueRef
1290lp_build_iround(struct lp_build_context *bld,
1291                LLVMValueRef a)
1292{
1293   LLVMBuilderRef builder = bld->gallivm->builder;
1294   const struct lp_type type = bld->type;
1295   LLVMTypeRef int_vec_type = bld->int_vec_type;
1296   LLVMValueRef res;
1297
1298   assert(type.floating);
1299
1300   assert(lp_check_value(type, a));
1301
1302   if (util_cpu_caps.has_sse2 &&
1303       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
1304      return lp_build_iround_nearest_sse2(bld, a);
1305   }
1306   else if (util_cpu_caps.has_sse4_1 &&
1307       (type.length == 1 || type.width*type.length == 128)) {
1308      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1309   }
1310   else {
1311      LLVMValueRef half;
1312
1313      half = lp_build_const_vec(bld->gallivm, type, 0.5);
1314
1315      if (type.sign) {
1316         LLVMTypeRef vec_type = bld->vec_type;
1317         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1318                                    (unsigned long long)1 << (type.width - 1));
1319         LLVMValueRef sign;
1320
1321         /* get sign bit */
1322         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1323         sign = LLVMBuildAnd(builder, sign, mask, "");
1324
1325         /* sign * 0.5 */
1326         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1327         half = LLVMBuildOr(builder, sign, half, "");
1328         half = LLVMBuildBitCast(builder, half, vec_type, "");
1329      }
1330
1331      res = LLVMBuildFAdd(builder, a, half, "");
1332   }
1333
1334   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1335
1336   return res;
1337}
1338
1339
1340/**
1341 * Return floor of float (vector), result is an int (vector)
1342 * Ex: ifloor(1.1) = 1.0
1343 * Ex: ifloor(-1.1) = -2.0
1344 */
1345LLVMValueRef
1346lp_build_ifloor(struct lp_build_context *bld,
1347                LLVMValueRef a)
1348{
1349   LLVMBuilderRef builder = bld->gallivm->builder;
1350   const struct lp_type type = bld->type;
1351   LLVMTypeRef int_vec_type = bld->int_vec_type;
1352   LLVMValueRef res;
1353
1354   assert(type.floating);
1355   assert(lp_check_value(type, a));
1356
1357   if (util_cpu_caps.has_sse4_1 &&
1358       (type.length == 1 || type.width*type.length == 128)) {
1359      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1360   }
1361   else {
1362      res = a;
1363
1364      if (type.sign) {
1365         /* Take the sign bit and add it to 1 constant */
1366         LLVMTypeRef vec_type = bld->vec_type;
1367         unsigned mantissa = lp_mantissa(type);
1368         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1369                                  (unsigned long long)1 << (type.width - 1));
1370         LLVMValueRef sign;
1371         LLVMValueRef offset;
1372
1373         /* sign = a < 0 ? ~0 : 0 */
1374         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1375         sign = LLVMBuildAnd(builder, sign, mask, "");
1376         sign = LLVMBuildAShr(builder, sign,
1377                              lp_build_const_int_vec(bld->gallivm, type,
1378                                                     type.width - 1),
1379                              "ifloor.sign");
1380
1381         /* offset = -0.99999(9)f */
1382         offset = lp_build_const_vec(bld->gallivm, type,
1383                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1384         offset = LLVMConstBitCast(offset, int_vec_type);
1385
1386         /* offset = a < 0 ? offset : 0.0f */
1387         offset = LLVMBuildAnd(builder, offset, sign, "");
1388         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");
1389
1390         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
1391      }
1392   }
1393
1394   /* round to nearest (toward zero) */
1395   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
1396
1397   return res;
1398}
1399
1400
1401/**
1402 * Return ceiling of float (vector), returning int (vector).
1403 * Ex: iceil( 1.1) = 2
1404 * Ex: iceil(-1.1) = -1
1405 */
1406LLVMValueRef
1407lp_build_iceil(struct lp_build_context *bld,
1408               LLVMValueRef a)
1409{
1410   LLVMBuilderRef builder = bld->gallivm->builder;
1411   const struct lp_type type = bld->type;
1412   LLVMTypeRef int_vec_type = bld->int_vec_type;
1413   LLVMValueRef res;
1414
1415   assert(type.floating);
1416   assert(lp_check_value(type, a));
1417
1418   if (util_cpu_caps.has_sse4_1 &&
1419       (type.length == 1 || type.width*type.length == 128)) {
1420      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1421   }
1422   else {
1423      LLVMTypeRef vec_type = bld->vec_type;
1424      unsigned mantissa = lp_mantissa(type);
1425      LLVMValueRef offset;
1426
1427      /* offset = 0.99999(9)f */
1428      offset = lp_build_const_vec(bld->gallivm, type,
1429                                  (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1430
1431      if (type.sign) {
1432         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1433                                (unsigned long long)1 << (type.width - 1));
1434         LLVMValueRef sign;
1435
1436         /* sign = a < 0 ? 0 : ~0 */
1437         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1438         sign = LLVMBuildAnd(builder, sign, mask, "");
1439         sign = LLVMBuildAShr(builder, sign,
1440                              lp_build_const_int_vec(bld->gallivm, type,
1441                                                     type.width - 1),
1442                              "iceil.sign");
1443         sign = LLVMBuildNot(builder, sign, "iceil.not");
1444
1445         /* offset = a < 0 ? 0.0 : offset */
1446         offset = LLVMConstBitCast(offset, int_vec_type);
1447         offset = LLVMBuildAnd(builder, offset, sign, "");
1448         offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
1449      }
1450
1451      res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
1452   }
1453
1454   /* round to nearest (toward zero) */
1455   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
1456
1457   return res;
1458}
1459
1460
1461/**
1462 * Combined ifloor() & fract().
1463 *
1464 * Preferred to calling the functions separately, as it will ensure that the
1465 * stratergy (floor() vs ifloor()) that results in less redundant work is used.
1466 */
1467void
1468lp_build_ifloor_fract(struct lp_build_context *bld,
1469                      LLVMValueRef a,
1470                      LLVMValueRef *out_ipart,
1471                      LLVMValueRef *out_fpart)
1472{
1473   LLVMBuilderRef builder = bld->gallivm->builder;
1474   const struct lp_type type = bld->type;
1475   LLVMValueRef ipart;
1476
1477   assert(type.floating);
1478   assert(lp_check_value(type, a));
1479
1480   if (util_cpu_caps.has_sse4_1 &&
1481       (type.length == 1 || type.width*type.length == 128)) {
1482      /*
1483       * floor() is easier.
1484       */
1485
1486      ipart = lp_build_floor(bld, a);
1487      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1488      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
1489   }
1490   else {
1491      /*
1492       * ifloor() is easier.
1493       */
1494
1495      *out_ipart = lp_build_ifloor(bld, a);
1496      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
1497      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1498   }
1499}
1500
1501
/**
 * Compute sqrt(a) via LLVM's llvm.sqrt.* intrinsic.
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   /* Build the overloaded intrinsic name, e.g. "llvm.sqrt.v4f32". */
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
1521
1522
1523/**
1524 * Do one Newton-Raphson step to improve reciprocate precision:
1525 *
1526 *   x_{i+1} = x_i * (2 - a * x_i)
1527 *
1528 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1529 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
1530 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
1531 * halo. It would be necessary to clamp the argument to prevent this.
1532 *
1533 * See also:
1534 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1535 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1536 */
1537static INLINE LLVMValueRef
1538lp_build_rcp_refine(struct lp_build_context *bld,
1539                    LLVMValueRef a,
1540                    LLVMValueRef rcp_a)
1541{
1542   LLVMBuilderRef builder = bld->gallivm->builder;
1543   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
1544   LLVMValueRef res;
1545
1546   res = LLVMBuildFMul(builder, a, rcp_a, "");
1547   res = LLVMBuildFSub(builder, two, res, "");
1548   res = LLVMBuildFMul(builder, rcp_a, res, "");
1549
1550   return res;
1551}
1552
1553
/**
 * Compute the reciprocal 1/a, with a few trivial algebraic shortcuts.
 */
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   /* Trivial cases resolved without emitting any code. */
   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   /* Fold constants at build time. */
   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   /* NOTE: deliberately disabled via FALSE for the reasons above; kept for
    * easy re-enabling during benchmarking.
    */
   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}
1604
1605
1606/**
1607 * Do one Newton-Raphson step to improve rsqrt precision:
1608 *
1609 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1610 *
1611 * See also:
1612 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1613 */
1614static INLINE LLVMValueRef
1615lp_build_rsqrt_refine(struct lp_build_context *bld,
1616                      LLVMValueRef a,
1617                      LLVMValueRef rsqrt_a)
1618{
1619   LLVMBuilderRef builder = bld->gallivm->builder;
1620   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
1621   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
1622   LLVMValueRef res;
1623
1624   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
1625   res = LLVMBuildFMul(builder, a, res, "");
1626   res = LLVMBuildFSub(builder, three, res, "");
1627   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
1628   res = LLVMBuildFMul(builder, half, res, "");
1629
1630   return res;
1631}
1632
1633
1634/**
1635 * Generate 1/sqrt(a)
1636 */
1637LLVMValueRef
1638lp_build_rsqrt(struct lp_build_context *bld,
1639               LLVMValueRef a)
1640{
1641   LLVMBuilderRef builder = bld->gallivm->builder;
1642   const struct lp_type type = bld->type;
1643
1644   assert(lp_check_value(type, a));
1645
1646   assert(type.floating);
1647
1648   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1649      const unsigned num_iterations = 1;
1650      LLVMValueRef res;
1651      unsigned i;
1652
1653      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1654
1655      for (i = 0; i < num_iterations; ++i) {
1656         res = lp_build_rsqrt_refine(bld, a, res);
1657      }
1658
1659      return res;
1660   }
1661
1662   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1663}
1664
1665
1666/**
1667 * Generate sin(a) using SSE2
1668 */
1669LLVMValueRef
1670lp_build_sin(struct lp_build_context *bld,
1671             LLVMValueRef a)
1672{
1673   struct gallivm_state *gallivm = bld->gallivm;
1674   LLVMBuilderRef builder = gallivm->builder;
1675   struct lp_type int_type = lp_int_type(bld->type);
1676   LLVMBuilderRef b = builder;
1677
1678   /*
1679    *  take the absolute value,
1680    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1681    */
1682
1683   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
1684   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
1685
1686   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1687   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
1688
1689   /*
1690    * extract the sign bit (upper one)
1691    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1692    */
1693   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
1694   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1695
1696   /*
1697    * scale by 4/Pi
1698    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1699    */
1700
1701   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
1702   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1703
1704   /*
1705    * store the integer part of y in mm0
1706    * emm2 = _mm_cvttps_epi32(y);
1707    */
1708
1709   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
1710
1711   /*
1712    * j=(j+1) & (~1) (see the cephes sources)
1713    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1714    */
1715
1716   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
1717   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1718   /*
1719    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1720    */
1721   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
1722   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1723
1724   /*
1725    * y = _mm_cvtepi32_ps(emm2);
1726    */
1727   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
1728
1729   /* get the swap sign flag
1730    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1731    */
1732   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
1733   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1734
1735   /*
1736    * emm2 = _mm_slli_epi32(emm0, 29);
1737    */
1738   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
1739   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1740
1741   /*
1742    * get the polynom selection mask
1743    * there is one polynom for 0 <= x <= Pi/4
1744    * and another one for Pi/4<x<=Pi/2
1745    * Both branches will be computed.
1746    *
1747    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1748    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1749    */
1750
1751   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1752   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1753   LLVMValueRef poly_mask = lp_build_compare(gallivm,
1754                                             int_type, PIPE_FUNC_EQUAL,
1755                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
1756   /*
1757    *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1758    */
1759   LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1760
1761   /*
1762    * _PS_CONST(minus_cephes_DP1, -0.78515625);
1763    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1764    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1765    */
1766   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
1767   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
1768   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
1769
1770   /*
1771    * The magic pass: "Extended precision modular arithmetic"
1772    * x = ((x - y * DP1) - y * DP2) - y * DP3;
1773    * xmm1 = _mm_mul_ps(y, xmm1);
1774    * xmm2 = _mm_mul_ps(y, xmm2);
1775    * xmm3 = _mm_mul_ps(y, xmm3);
1776    */
1777   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1778   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1779   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1780
1781   /*
1782    * x = _mm_add_ps(x, xmm1);
1783    * x = _mm_add_ps(x, xmm2);
1784    * x = _mm_add_ps(x, xmm3);
1785    */
1786
1787   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1788   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1789   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1790
1791   /*
1792    * Evaluate the first polynom  (0 <= x <= Pi/4)
1793    *
1794    * z = _mm_mul_ps(x,x);
1795    */
1796   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1797
1798   /*
1799    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
1800    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1801    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
1802    */
1803   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
1804   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
1805   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
1806
1807   /*
1808    * y = *(v4sf*)_ps_coscof_p0;
1809    * y = _mm_mul_ps(y, z);
1810    */
1811   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1812   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1813   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1814   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1815   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1816   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1817
1818
1819   /*
1820    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1821    * y = _mm_sub_ps(y, tmp);
1822    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1823    */
1824   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
1825   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1826   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1827   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
1828   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1829
1830   /*
1831    * _PS_CONST(sincof_p0, -1.9515295891E-4);
1832    * _PS_CONST(sincof_p1,  8.3321608736E-3);
1833    * _PS_CONST(sincof_p2, -1.6666654611E-1);
1834    */
1835   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
1836   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
1837   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
1838
1839   /*
1840    * Evaluate the second polynom  (Pi/4 <= x <= 0)
1841    *
1842    * y2 = *(v4sf*)_ps_sincof_p0;
1843    * y2 = _mm_mul_ps(y2, z);
1844    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1845    * y2 = _mm_mul_ps(y2, z);
1846    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1847    * y2 = _mm_mul_ps(y2, z);
1848    * y2 = _mm_mul_ps(y2, x);
1849    * y2 = _mm_add_ps(y2, x);
1850    */
1851
1852   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1853   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1854   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1855   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1856   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1857   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1858   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1859
1860   /*
1861    * select the correct result from the two polynoms
1862    * xmm3 = poly_mask;
1863    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1864    * y = _mm_andnot_ps(xmm3, y);
1865    * y = _mm_add_ps(y,y2);
1866    */
1867   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
1868   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
1869   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1870   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
1871   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1872   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1873   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1874
1875   /*
1876    * update the sign
1877    * y = _mm_xor_ps(y, sign_bit);
1878    */
1879   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1880   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
1881   return y_result;
1882}
1883
1884
1885/**
1886 * Generate cos(a) using SSE2
1887 */
1888LLVMValueRef
1889lp_build_cos(struct lp_build_context *bld,
1890             LLVMValueRef a)
1891{
1892   struct gallivm_state *gallivm = bld->gallivm;
1893   LLVMBuilderRef builder = gallivm->builder;
1894   struct lp_type int_type = lp_int_type(bld->type);
1895   LLVMBuilderRef b = builder;
1896
1897   /*
1898    *  take the absolute value,
1899    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1900    */
1901
1902   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
1903   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
1904
1905   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1906   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
1907
1908   /*
1909    * scale by 4/Pi
1910    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1911    */
1912
1913   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
1914   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1915
1916   /*
1917    * store the integer part of y in mm0
1918    * emm2 = _mm_cvttps_epi32(y);
1919    */
1920
1921   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
1922
1923   /*
1924    * j=(j+1) & (~1) (see the cephes sources)
1925    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1926    */
1927
1928   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
1929   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1930   /*
1931    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1932    */
1933   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
1934   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1935
1936   /*
1937    * y = _mm_cvtepi32_ps(emm2);
1938    */
1939   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
1940
1941
1942   /*
1943    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1944    */
1945   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1946   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1947
1948
1949   /* get the swap sign flag
1950    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1951    */
1952   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
1953   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1954   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
1955   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1956
1957   /*
1958    * emm2 = _mm_slli_epi32(emm0, 29);
1959    */
1960   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
1961   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1962
1963   /*
1964    * get the polynom selection mask
1965    * there is one polynom for 0 <= x <= Pi/4
1966    * and another one for Pi/4<x<=Pi/2
1967    * Both branches will be computed.
1968    *
1969    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1970    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1971    */
1972
1973   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1974   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1975   LLVMValueRef poly_mask = lp_build_compare(gallivm,
1976                                             int_type, PIPE_FUNC_EQUAL,
1977   				             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
1978
1979   /*
1980    * _PS_CONST(minus_cephes_DP1, -0.78515625);
1981    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1982    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1983    */
1984   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
1985   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
1986   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
1987
1988   /*
1989    * The magic pass: "Extended precision modular arithmetic"
1990    * x = ((x - y * DP1) - y * DP2) - y * DP3;
1991    * xmm1 = _mm_mul_ps(y, xmm1);
1992    * xmm2 = _mm_mul_ps(y, xmm2);
1993    * xmm3 = _mm_mul_ps(y, xmm3);
1994    */
1995   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1996   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1997   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1998
1999   /*
2000    * x = _mm_add_ps(x, xmm1);
2001    * x = _mm_add_ps(x, xmm2);
2002    * x = _mm_add_ps(x, xmm3);
2003    */
2004
2005   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2006   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2007   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2008
2009   /*
2010    * Evaluate the first polynom  (0 <= x <= Pi/4)
2011    *
2012    * z = _mm_mul_ps(x,x);
2013    */
2014   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2015
2016   /*
2017    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2018    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2019    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2020    */
2021   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2022   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2023   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2024
2025   /*
2026    * y = *(v4sf*)_ps_coscof_p0;
2027    * y = _mm_mul_ps(y, z);
2028    */
2029   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2030   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2031   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2032   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2033   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2034   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2035
2036
2037   /*
2038    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2039    * y = _mm_sub_ps(y, tmp);
2040    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2041    */
2042   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2043   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2044   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2045   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2046   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2047
2048   /*
2049    * _PS_CONST(sincof_p0, -1.9515295891E-4);
2050    * _PS_CONST(sincof_p1,  8.3321608736E-3);
2051    * _PS_CONST(sincof_p2, -1.6666654611E-1);
2052    */
2053   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2054   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2055   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2056
2057   /*
2058    * Evaluate the second polynom  (Pi/4 <= x <= 0)
2059    *
2060    * y2 = *(v4sf*)_ps_sincof_p0;
2061    * y2 = _mm_mul_ps(y2, z);
2062    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2063    * y2 = _mm_mul_ps(y2, z);
2064    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2065    * y2 = _mm_mul_ps(y2, z);
2066    * y2 = _mm_mul_ps(y2, x);
2067    * y2 = _mm_add_ps(y2, x);
2068    */
2069
2070   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2071   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2072   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2073   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2074   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2075   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2076   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2077
2078   /*
2079    * select the correct result from the two polynoms
2080    * xmm3 = poly_mask;
2081    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2082    * y = _mm_andnot_ps(xmm3, y);
2083    * y = _mm_add_ps(y,y2);
2084    */
2085   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2086   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2087   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2088   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2089   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2090   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2091
2092   /*
2093    * update the sign
2094    * y = _mm_xor_ps(y, sign_bit);
2095    */
2096   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2097   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2098   return y_result;
2099}
2100
2101
2102/**
2103 * Generate pow(x, y)
2104 */
2105LLVMValueRef
2106lp_build_pow(struct lp_build_context *bld,
2107             LLVMValueRef x,
2108             LLVMValueRef y)
2109{
2110   /* TODO: optimize the constant case */
2111   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2112       LLVMIsConstant(x) && LLVMIsConstant(y)) {
2113      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2114                   __FUNCTION__);
2115   }
2116
2117   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2118}
2119
2120
2121/**
2122 * Generate exp(x)
2123 */
2124LLVMValueRef
2125lp_build_exp(struct lp_build_context *bld,
2126             LLVMValueRef x)
2127{
2128   /* log2(e) = 1/log(2) */
2129   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2130                                           1.4426950408889634);
2131
2132   assert(lp_check_value(bld->type, x));
2133
2134   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2135}
2136
2137
2138/**
2139 * Generate log(x)
2140 */
2141LLVMValueRef
2142lp_build_log(struct lp_build_context *bld,
2143             LLVMValueRef x)
2144{
2145   /* log(2) */
2146   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2147                                          0.69314718055994529);
2148
2149   assert(lp_check_value(bld->type, x));
2150
2151   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2152}
2153
2154
2155/**
2156 * Generate polynomial.
2157 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2158 */
2159static LLVMValueRef
2160lp_build_polynomial(struct lp_build_context *bld,
2161                    LLVMValueRef x,
2162                    const double *coeffs,
2163                    unsigned num_coeffs)
2164{
2165   const struct lp_type type = bld->type;
2166   LLVMValueRef res = NULL;
2167   unsigned i;
2168
2169   assert(lp_check_value(bld->type, x));
2170
2171   /* TODO: optimize the constant case */
2172   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2173       LLVMIsConstant(x)) {
2174      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2175                   __FUNCTION__);
2176   }
2177
2178   for (i = num_coeffs; i--; ) {
2179      LLVMValueRef coeff;
2180
2181      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2182
2183      if(res)
2184         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
2185      else
2186         res = coeff;
2187   }
2188
2189   if(res)
2190      return res;
2191   else
2192      return bld->undef;
2193}
2194
2195
2196/**
2197 * Minimax polynomial fit of 2**x, in range [0, 1[
2198 */
2199const double lp_build_exp2_polynomial[] = {
2200#if EXP_POLY_DEGREE == 5
2201   0.999999925063526176901,
2202   0.693153073200168932794,
2203   0.240153617044375388211,
2204   0.0558263180532956664775,
2205   0.00898934009049466391101,
2206   0.00187757667519147912699
2207#elif EXP_POLY_DEGREE == 4
2208   1.00000259337069434683,
2209   0.693003834469974940458,
2210   0.24144275689150793076,
2211   0.0520114606103070150235,
2212   0.0135341679161270268764
2213#elif EXP_POLY_DEGREE == 3
2214   0.999925218562710312959,
2215   0.695833540494823811697,
2216   0.226067155427249155588,
2217   0.0780245226406372992967
2218#elif EXP_POLY_DEGREE == 2
2219   1.00172476321474503578,
2220   0.657636275736077639316,
2221   0.33718943461968720704
2222#else
2223#error
2224#endif
2225};
2226
2227
/**
 * Approximate 2^x by splitting x = ipart + fpart, with ipart = ifloor(x)
 * and fpart = x - ipart in [0, 1[.  2^ipart is built exactly by placing
 * the biased exponent in the bits of an IEEE-754 single; 2^fpart comes
 * from the minimax polynomial above.
 *
 * Any subset of the three results may be requested via the output
 * pointers; pass NULL for outputs not needed.  Requires 32-bit float
 * vectors (asserted below).
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /* Clamp x to the exponent range representable in single precision,
       * so the (ipart + 127) << 23 construction below stays in range. */
      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      /* fpart = x - ipart */
      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) */
      /* Build 2^ipart exactly: bias the exponent, shift it into bits
       * 30:23 of an IEEE-754 single, and bitcast back to float. */
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* 2^fpart via the minimax polynomial, valid since fpart is in [0, 1[ */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      /* 2^x = 2^ipart * 2^fpart */
      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
2289
2290
2291LLVMValueRef
2292lp_build_exp2(struct lp_build_context *bld,
2293              LLVMValueRef x)
2294{
2295   LLVMValueRef res;
2296   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2297   return res;
2298}
2299
2300
2301/**
2302 * Extract the exponent of a IEEE-754 floating point value.
2303 *
2304 * Optionally apply an integer bias.
2305 *
2306 * Result is an integer value with
2307 *
2308 *   ifloor(log2(x)) + bias
2309 */
2310LLVMValueRef
2311lp_build_extract_exponent(struct lp_build_context *bld,
2312                          LLVMValueRef x,
2313                          int bias)
2314{
2315   LLVMBuilderRef builder = bld->gallivm->builder;
2316   const struct lp_type type = bld->type;
2317   unsigned mantissa = lp_mantissa(type);
2318   LLVMValueRef res;
2319
2320   assert(type.floating);
2321
2322   assert(lp_check_value(bld->type, x));
2323
2324   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2325
2326   res = LLVMBuildLShr(builder, x,
2327                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2328   res = LLVMBuildAnd(builder, res,
2329                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
2330   res = LLVMBuildSub(builder, res,
2331                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2332
2333   return res;
2334}
2335
2336
2337/**
2338 * Extract the mantissa of the a floating.
2339 *
2340 * Result is a floating point value with
2341 *
2342 *   x / floor(log2(x))
2343 */
2344LLVMValueRef
2345lp_build_extract_mantissa(struct lp_build_context *bld,
2346                          LLVMValueRef x)
2347{
2348   LLVMBuilderRef builder = bld->gallivm->builder;
2349   const struct lp_type type = bld->type;
2350   unsigned mantissa = lp_mantissa(type);
2351   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2352                                                  (1ULL << mantissa) - 1);
2353   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2354   LLVMValueRef res;
2355
2356   assert(lp_check_value(bld->type, x));
2357
2358   assert(type.floating);
2359
2360   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2361
2362   /* res = x / 2**ipart */
2363   res = LLVMBuildAnd(builder, x, mantmask, "");
2364   res = LLVMBuildOr(builder, res, one, "");
2365   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2366
2367   return res;
2368}
2369
2370
2371
2372/**
2373 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2374 * These coefficients can be generate with
2375 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2376 */
2377const double lp_build_log2_polynomial[] = {
2378#if LOG_POLY_DEGREE == 6
2379   3.11578814719469302614,
2380   -3.32419399085241980044,
2381   2.59883907202499966007,
2382   -1.23152682416275988241,
2383   0.318212422185251071475,
2384   -0.0344359067839062357313
2385#elif LOG_POLY_DEGREE == 5
2386   2.8882704548164776201,
2387   -2.52074962577807006663,
2388   1.48116647521213171641,
2389   -0.465725644288844778798,
2390   0.0596515482674574969533
2391#elif LOG_POLY_DEGREE == 4
2392   2.61761038894603480148,
2393   -1.75647175389045657003,
2394   0.688243882994381274313,
2395   -0.107254423828329604454
2396#elif LOG_POLY_DEGREE == 3
2397   2.28330284476918490682,
2398   -1.04913055217340124191,
2399   0.204446009836232697516
2400#else
2401#error
2402#endif
2403};
2404
2405
2406/**
2407 * See http://www.devmaster.net/forums/showthread.php?p=43580
2408 */
2409void
2410lp_build_log2_approx(struct lp_build_context *bld,
2411                     LLVMValueRef x,
2412                     LLVMValueRef *p_exp,
2413                     LLVMValueRef *p_floor_log2,
2414                     LLVMValueRef *p_log2)
2415{
2416   LLVMBuilderRef builder = bld->gallivm->builder;
2417   const struct lp_type type = bld->type;
2418   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2419   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2420
2421   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2422   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2423   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2424
2425   LLVMValueRef i = NULL;
2426   LLVMValueRef exp = NULL;
2427   LLVMValueRef mant = NULL;
2428   LLVMValueRef logexp = NULL;
2429   LLVMValueRef logmant = NULL;
2430   LLVMValueRef res = NULL;
2431
2432   assert(lp_check_value(bld->type, x));
2433
2434   if(p_exp || p_floor_log2 || p_log2) {
2435      /* TODO: optimize the constant case */
2436      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2437          LLVMIsConstant(x)) {
2438         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2439                      __FUNCTION__);
2440      }
2441
2442      assert(type.floating && type.width == 32);
2443
2444      /*
2445       * We don't explicitly handle denormalized numbers. They will yield a
2446       * result in the neighbourhood of -127, which appears to be adequate
2447       * enough.
2448       */
2449
2450      i = LLVMBuildBitCast(builder, x, int_vec_type, "");
2451
2452      /* exp = (float) exponent(x) */
2453      exp = LLVMBuildAnd(builder, i, expmask, "");
2454   }
2455
2456   if(p_floor_log2 || p_log2) {
2457      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
2458      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
2459      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
2460   }
2461
2462   if(p_log2) {
2463      /* mant = (float) mantissa(x) */
2464      mant = LLVMBuildAnd(builder, i, mantmask, "");
2465      mant = LLVMBuildOr(builder, mant, one, "");
2466      mant = LLVMBuildBitCast(builder, mant, vec_type, "");
2467
2468      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2469                                    Elements(lp_build_log2_polynomial));
2470
2471      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
2472      logmant = LLVMBuildFMul(builder, logmant, LLVMBuildFSub(builder, mant, bld->one, ""), "");
2473
2474      res = LLVMBuildFAdd(builder, logmant, logexp, "");
2475   }
2476
2477   if(p_exp) {
2478      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
2479      *p_exp = exp;
2480   }
2481
2482   if(p_floor_log2)
2483      *p_floor_log2 = logexp;
2484
2485   if(p_log2)
2486      *p_log2 = res;
2487}
2488
2489
2490LLVMValueRef
2491lp_build_log2(struct lp_build_context *bld,
2492              LLVMValueRef x)
2493{
2494   LLVMValueRef res;
2495   lp_build_log2_approx(bld, x, NULL, NULL, &res);
2496   return res;
2497}
2498
2499
2500/**
2501 * Faster (and less accurate) log2.
2502 *
2503 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
2504 *
2505 * Piece-wise linear approximation, with exact results when x is a
2506 * power of two.
2507 *
2508 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2509 */
2510LLVMValueRef
2511lp_build_fast_log2(struct lp_build_context *bld,
2512                   LLVMValueRef x)
2513{
2514   LLVMBuilderRef builder = bld->gallivm->builder;
2515   LLVMValueRef ipart;
2516   LLVMValueRef fpart;
2517
2518   assert(lp_check_value(bld->type, x));
2519
2520   assert(bld->type.floating);
2521
2522   /* ipart = floor(log2(x)) - 1 */
2523   ipart = lp_build_extract_exponent(bld, x, -1);
2524   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
2525
2526   /* fpart = x / 2**ipart */
2527   fpart = lp_build_extract_mantissa(bld, x);
2528
2529   /* ipart + fpart */
2530   return LLVMBuildFAdd(builder, ipart, fpart, "");
2531}
2532
2533
2534/**
2535 * Fast implementation of iround(log2(x)).
2536 *
2537 * Not an approximation -- it should give accurate results all the time.
2538 */
2539LLVMValueRef
2540lp_build_ilog2(struct lp_build_context *bld,
2541               LLVMValueRef x)
2542{
2543   LLVMBuilderRef builder = bld->gallivm->builder;
2544   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
2545   LLVMValueRef ipart;
2546
2547   assert(bld->type.floating);
2548
2549   assert(lp_check_value(bld->type, x));
2550
2551   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
2552   x = LLVMBuildFMul(builder, x, sqrt2, "");
2553
2554   /* ipart = floor(log2(x) + 0.5)  */
2555   ipart = lp_build_extract_exponent(bld, x, 0);
2556
2557   return ipart;
2558}
2559
2560LLVMValueRef
2561lp_build_mod(struct lp_build_context *bld,
2562             LLVMValueRef x,
2563             LLVMValueRef y)
2564{
2565   LLVMBuilderRef builder = bld->gallivm->builder;
2566   LLVMValueRef res;
2567   const struct lp_type type = bld->type;
2568
2569   assert(lp_check_value(type, x));
2570   assert(lp_check_value(type, y));
2571
2572   if (type.floating)
2573      res = LLVMBuildFRem(builder, x, y, "");
2574   else if (type.sign)
2575      res = LLVMBuildSRem(builder, x, y, "");
2576   else
2577      res = LLVMBuildURem(builder, x, y, "");
2578   return res;
2579}
2580