lp_bld_arit.c revision a3d4af0c00a198bba00c0965131a14223d426dd4
1/**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29/**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 *   of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48#include "util/u_memory.h"
49#include "util/u_debug.h"
50#include "util/u_math.h"
51#include "util/u_string.h"
52#include "util/u_cpu_detect.h"
53
54#include "lp_bld_type.h"
55#include "lp_bld_const.h"
56#include "lp_bld_init.h"
57#include "lp_bld_intr.h"
58#include "lp_bld_logic.h"
59#include "lp_bld_pack.h"
60#include "lp_bld_debug.h"
61#include "lp_bld_arit.h"
62
63
64#define EXP_POLY_DEGREE 5
65
66#define LOG_POLY_DEGREE 4
67
68
69/**
70 * Generate min(a, b)
71 * No checks for special case values of a or b = 1 or 0 are done.
72 */
73static LLVMValueRef
74lp_build_min_simple(struct lp_build_context *bld,
75                    LLVMValueRef a,
76                    LLVMValueRef b)
77{
78   LLVMBuilderRef builder = bld->gallivm->builder;
79   const struct lp_type type = bld->type;
80   const char *intrinsic = NULL;
81   LLVMValueRef cond;
82
83   assert(lp_check_value(type, a));
84   assert(lp_check_value(type, b));
85
86   /* TODO: optimize the constant case */
87
88   if(type.width * type.length == 128) {
89      if(type.floating) {
90         if(type.width == 32 && util_cpu_caps.has_sse)
91            intrinsic = "llvm.x86.sse.min.ps";
92         if(type.width == 64 && util_cpu_caps.has_sse2)
93            intrinsic = "llvm.x86.sse2.min.pd";
94      }
95      else {
96         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
97            intrinsic = "llvm.x86.sse2.pminu.b";
98         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
99            intrinsic = "llvm.x86.sse41.pminsb";
100         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
101            intrinsic = "llvm.x86.sse41.pminuw";
102         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
103            intrinsic = "llvm.x86.sse2.pmins.w";
104         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
105            intrinsic = "llvm.x86.sse41.pminud";
106         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
107            intrinsic = "llvm.x86.sse41.pminsd";
108      }
109   }
110
111   if(intrinsic)
112      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
113
114   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
115   return lp_build_select(bld, cond, a, b);
116}
117
118
119/**
120 * Generate max(a, b)
121 * No checks for special case values of a or b = 1 or 0 are done.
122 */
123static LLVMValueRef
124lp_build_max_simple(struct lp_build_context *bld,
125                    LLVMValueRef a,
126                    LLVMValueRef b)
127{
128   LLVMBuilderRef builder = bld->gallivm->builder;
129   const struct lp_type type = bld->type;
130   const char *intrinsic = NULL;
131   LLVMValueRef cond;
132
133   assert(lp_check_value(type, a));
134   assert(lp_check_value(type, b));
135
136   /* TODO: optimize the constant case */
137
138   if(type.width * type.length == 128) {
139      if(type.floating) {
140         if(type.width == 32 && util_cpu_caps.has_sse)
141            intrinsic = "llvm.x86.sse.max.ps";
142         if(type.width == 64 && util_cpu_caps.has_sse2)
143            intrinsic = "llvm.x86.sse2.max.pd";
144      }
145      else {
146         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
147            intrinsic = "llvm.x86.sse2.pmaxu.b";
148         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
149            intrinsic = "llvm.x86.sse41.pmaxsb";
150         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
151            intrinsic = "llvm.x86.sse41.pmaxuw";
152         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
153            intrinsic = "llvm.x86.sse2.pmaxs.w";
154         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
155            intrinsic = "llvm.x86.sse41.pmaxud";
156         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
157            intrinsic = "llvm.x86.sse41.pmaxsd";
158      }
159   }
160
161   if(intrinsic)
162      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
163
164   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
165   return lp_build_select(bld, cond, a, b);
166}
167
168
169/**
170 * Generate 1 - a, or ~a depending on bld->type.
171 */
172LLVMValueRef
173lp_build_comp(struct lp_build_context *bld,
174              LLVMValueRef a)
175{
176   LLVMBuilderRef builder = bld->gallivm->builder;
177   const struct lp_type type = bld->type;
178
179   assert(lp_check_value(type, a));
180
181   if(a == bld->one)
182      return bld->zero;
183   if(a == bld->zero)
184      return bld->one;
185
186   if(type.norm && !type.floating && !type.fixed && !type.sign) {
187      if(LLVMIsConstant(a))
188         return LLVMConstNot(a);
189      else
190         return LLVMBuildNot(builder, a, "");
191   }
192
193   if(LLVMIsConstant(a))
194      if (type.floating)
195          return LLVMConstFSub(bld->one, a);
196      else
197          return LLVMConstSub(bld->one, a);
198   else
199      if (type.floating)
200         return LLVMBuildFSub(builder, bld->one, a, "");
201      else
202         return LLVMBuildSub(builder, bld->one, a, "");
203}
204
205
/**
 * Generate a + b, saturating when the type is normalized.
 *
 * Folds additions with zero/undef operands, uses SSE2 saturated-add
 * intrinsics for 128-bit 8/16-bit normalized integer vectors, and clamps
 * the result to 1.0 for normalized floating/fixed types.
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* a + 0 == a, 0 + b == b */
   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   /* Undef operands poison the result. */
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* For normalized types 1 is the maximum, so 1 + x saturates to 1. */
      if(a == bld->one || b == bld->one)
        return bld->one;

      /* Saturated add intrinsics exist only for 8/16-bit integer vectors. */
      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* Constant-fold when both operands are constants, otherwise emit the
    * appropriate add instruction. */
   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
266
267
268/** Return the scalar sum of the elements of a */
269LLVMValueRef
270lp_build_sum_vector(struct lp_build_context *bld,
271                    LLVMValueRef a)
272{
273   LLVMBuilderRef builder = bld->gallivm->builder;
274   const struct lp_type type = bld->type;
275   LLVMValueRef index, res;
276   unsigned i;
277
278   assert(lp_check_value(type, a));
279
280   if (type.length == 1) {
281      return a;
282   }
283
284   assert(!bld->type.norm);
285
286   index = lp_build_const_int32(bld->gallivm, 0);
287   res = LLVMBuildExtractElement(builder, a, index, "");
288
289   for (i = 1; i < type.length; i++) {
290      index = lp_build_const_int32(bld->gallivm, i);
291      if (type.floating)
292         res = LLVMBuildFAdd(builder, res,
293                            LLVMBuildExtractElement(builder,
294                                                    a, index, ""),
295                            "");
296      else
297         res = LLVMBuildAdd(builder, res,
298                            LLVMBuildExtractElement(builder,
299                                                    a, index, ""),
300                            "");
301   }
302
303   return res;
304}
305
306
/**
 * Generate a - b, saturating when the type is normalized.
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* a - 0 == a */
   if(b == bld->zero)
      return a;
   /* Undef operands poison the result. */
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   /* a - a == 0 */
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* Subtracting the maximum saturates to zero.
       * NOTE(review): exact for unsigned norm; for signed norm this assumes
       * a >= 0 — confirm callers never pass negative a here. */
      if(b == bld->one)
        return bld->zero;

      /* Saturated subtract intrinsics exist only for 8/16-bit integer
       * vectors. */
      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* Constant-fold when both operands are constants, otherwise emit the
    * appropriate subtract instruction. */
   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   /* Clamp to the floor of 0 for normalized float/fixed types. */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}
364
365
366/**
367 * Normalized 8bit multiplication.
368 *
369 * - alpha plus one
370 *
371 *     makes the following approximation to the division (Sree)
372 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
374 *
375 *     which is the fastest method that satisfies the following OpenGL criteria
376 *
377 *       0*0 = 0 and 255*255 = 255
378 *
379 * - geometric series
380 *
381 *     takes the geometric series approximation to the division
382 *
383 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
384 *
385 *     in this case just the first two terms to fit in 16bit arithmetic
386 *
387 *       t/255 ~= (t + (t >> 8)) >> 8
388 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
390 *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
391 *     must be used
392 *
393 * - geometric series plus rounding
394 *
395 *     when using a geometric series division instead of truncating the result
396 *     use roundoff in the approximation (Jim Blinn)
397 *
398 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
399 *
400 *     achieving the exact results
401 *
402 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
403 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
404 * @sa Michael Herf, The "double blend trick", May 2000,
405 *     http://www.stereopsis.com/doubleblend.html
406 */
407static LLVMValueRef
408lp_build_mul_u8n(struct gallivm_state *gallivm,
409                 struct lp_type i16_type,
410                 LLVMValueRef a, LLVMValueRef b)
411{
412   LLVMBuilderRef builder = gallivm->builder;
413   LLVMValueRef c8;
414   LLVMValueRef ab;
415
416   assert(!i16_type.floating);
417   assert(lp_check_value(i16_type, a));
418   assert(lp_check_value(i16_type, b));
419
420   c8 = lp_build_const_int_vec(gallivm, i16_type, 8);
421
422#if 0
423
424   /* a*b/255 ~= (a*(b + 1)) >> 256 */
425   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallium, i16_type, 1), "");
426   ab = LLVMBuildMul(builder, a, b, "");
427
428#else
429
430   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
431   ab = LLVMBuildMul(builder, a, b, "");
432   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
433   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");
434
435#endif
436
437   ab = LLVMBuildLShr(builder, ab, c8, "");
438
439   return ab;
440}
441
442
/**
 * Generate a * b.
 *
 * Handles three families: normalized 8-bit integers (via the widened
 * approximate divide-by-255 helper), fixed point (multiply then shift the
 * product back down), and plain int/float multiplies, with constant
 * folding where possible.
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* Algebraic identities. */
   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         /* 8-bit normalized: widen both operands to 16 bits, multiply
          * low/high halves with the a*b/255 approximation, pack back. */
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);

         ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME: normalized integer multiply only implemented for 8-bit */
      assert(0);
   }

   /* Fixed point: the product must be shifted back down by half the width
    * to restore the scale factor. */
   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      /* Constant-folding path. */
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         /* Arithmetic shift preserves the sign of fixed-point products. */
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}
523
524
/**
 * Small vector x scale multiplication optimization.
 *
 * Multiplies a by a small integer constant b, folding the trivial cases
 * (0, 1, -1, 2) and using a shift for integer powers of two.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   /* a * 0 == 0 */
   if(b == 0)
      return bld->zero;

   /* a * 1 == a */
   if(b == 1)
      return a;

   /* a * -1 == -a */
   if(b == -1)
      return lp_build_negate(bld, a);

   /* a * 2 == a + a; cheaper than a multiply for floats */
   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;   /* log2(b) */

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the mantissa.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         /* Integer power of two: shift left instead of multiplying. */
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   /* General case: full multiply by a splatted constant. */
   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
579
580
581/**
582 * Generate a / b
583 */
584LLVMValueRef
585lp_build_div(struct lp_build_context *bld,
586             LLVMValueRef a,
587             LLVMValueRef b)
588{
589   LLVMBuilderRef builder = bld->gallivm->builder;
590   const struct lp_type type = bld->type;
591
592   assert(lp_check_value(type, a));
593   assert(lp_check_value(type, b));
594
595   if(a == bld->zero)
596      return bld->zero;
597   if(a == bld->one)
598      return lp_build_rcp(bld, b);
599   if(b == bld->zero)
600      return bld->undef;
601   if(b == bld->one)
602      return a;
603   if(a == bld->undef || b == bld->undef)
604      return bld->undef;
605
606   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
607      if (type.floating)
608         return LLVMConstFDiv(a, b);
609      else if (type.sign)
610         return LLVMConstSDiv(a, b);
611      else
612         return LLVMConstUDiv(a, b);
613   }
614
615   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 &&
616      type.floating)
617      return lp_build_mul(bld, a, lp_build_rcp(bld, b));
618
619   if (type.floating)
620      return LLVMBuildFDiv(builder, a, b, "");
621   else if (type.sign)
622      return LLVMBuildSDiv(builder, a, b, "");
623   else
624      return LLVMBuildUDiv(builder, a, b, "");
625}
626
627
/**
 * Linear interpolation -- without any checks.
 *
 * Computes v0 + x*(v1 - v0).
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   /* res = v0 + x * (v1 - v0) */
   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if (bld->type.fixed) {
      /* Mask off the upper half so only the low (value) bits survive. */
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for other uses. Basically we need a more
       * powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
   }

   return res;
}
663
664
/**
 * Linear interpolation.
 *
 * For normalized (unsigned) types the computation is done in a
 * twice-as-wide fixed-point type so the intermediate product does not
 * overflow; the two halves are processed separately and packed back.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
      LLVMValueRef shift;

      assert(type.length >= 2);
      assert(!type.sign);

      /*
       * Create a wider type, enough to hold the intermediate result of the
       * multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.fixed  = TRUE;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      /* Split each vector into low/high halves in the wider type. */
      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Scale x from [0, 255] to [0, 256], via x += x >> (type.width - 1).
       */

      shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);

      xl = lp_build_add(&wide_bld, xl,
                        LLVMBuildAShr(builder, xl, shift, ""));
      xh = lp_build_add(&wide_bld, xh,
                        LLVMBuildAShr(builder, xh, shift, ""));

      /*
       * Lerp both halves.
       */

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);

      /* Pack the wide halves back into the original narrow type. */
      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1);
   }

   return res;
}
731
732
733LLVMValueRef
734lp_build_lerp_2d(struct lp_build_context *bld,
735                 LLVMValueRef x,
736                 LLVMValueRef y,
737                 LLVMValueRef v00,
738                 LLVMValueRef v01,
739                 LLVMValueRef v10,
740                 LLVMValueRef v11)
741{
742   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
743   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
744   return lp_build_lerp(bld, y, v0, v1);
745}
746
747
748/**
749 * Generate min(a, b)
750 * Do checks for special cases.
751 */
752LLVMValueRef
753lp_build_min(struct lp_build_context *bld,
754             LLVMValueRef a,
755             LLVMValueRef b)
756{
757   assert(lp_check_value(bld->type, a));
758   assert(lp_check_value(bld->type, b));
759
760   if(a == bld->undef || b == bld->undef)
761      return bld->undef;
762
763   if(a == b)
764      return a;
765
766   if (bld->type.norm) {
767      if (!bld->type.sign) {
768         if (a == bld->zero || b == bld->zero) {
769            return bld->zero;
770         }
771      }
772      if(a == bld->one)
773         return b;
774      if(b == bld->one)
775         return a;
776   }
777
778   return lp_build_min_simple(bld, a, b);
779}
780
781
782/**
783 * Generate max(a, b)
784 * Do checks for special cases.
785 */
786LLVMValueRef
787lp_build_max(struct lp_build_context *bld,
788             LLVMValueRef a,
789             LLVMValueRef b)
790{
791   assert(lp_check_value(bld->type, a));
792   assert(lp_check_value(bld->type, b));
793
794   if(a == bld->undef || b == bld->undef)
795      return bld->undef;
796
797   if(a == b)
798      return a;
799
800   if(bld->type.norm) {
801      if(a == bld->one || b == bld->one)
802         return bld->one;
803      if (!bld->type.sign) {
804         if (a == bld->zero) {
805            return b;
806         }
807         if (b == bld->zero) {
808            return a;
809         }
810      }
811   }
812
813   return lp_build_max_simple(bld, a, b);
814}
815
816
817/**
818 * Generate clamp(a, min, max)
819 * Do checks for special cases.
820 */
821LLVMValueRef
822lp_build_clamp(struct lp_build_context *bld,
823               LLVMValueRef a,
824               LLVMValueRef min,
825               LLVMValueRef max)
826{
827   assert(lp_check_value(bld->type, a));
828   assert(lp_check_value(bld->type, min));
829   assert(lp_check_value(bld->type, max));
830
831   a = lp_build_min(bld, a, max);
832   a = lp_build_max(bld, a, min);
833   return a;
834}
835
836
837/**
838 * Generate abs(a)
839 */
840LLVMValueRef
841lp_build_abs(struct lp_build_context *bld,
842             LLVMValueRef a)
843{
844   LLVMBuilderRef builder = bld->gallivm->builder;
845   const struct lp_type type = bld->type;
846   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
847
848   assert(lp_check_value(type, a));
849
850   if(!type.sign)
851      return a;
852
853   if(type.floating) {
854      /* Mask out the sign bit */
855      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
856      unsigned long long absMask = ~(1ULL << (type.width - 1));
857      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
858      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
859      a = LLVMBuildAnd(builder, a, mask, "");
860      a = LLVMBuildBitCast(builder, a, vec_type, "");
861      return a;
862   }
863
864   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
865      switch(type.width) {
866      case 8:
867         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
868      case 16:
869         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
870      case 32:
871         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
872      }
873   }
874
875   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
876}
877
878
/**
 * Generate -a.
 */
LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   /* LLVM 2.7+ provides a dedicated floating-point negate. */
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}
896
897
/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      /* maskBit selects only the IEEE sign bit of each element. */
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      /* i.e. copy a's sign bit onto the constant 1.0, giving +/-1.0. */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* res = (a > 0) ? 1 : -1 */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
948
949
950/**
951 * Set the sign of float vector 'a' according to 'sign'.
952 * If sign==0, return abs(a).
953 * If sign==1, return -abs(a);
954 * Other values for sign produce undefined results.
955 */
956LLVMValueRef
957lp_build_set_sign(struct lp_build_context *bld,
958                  LLVMValueRef a, LLVMValueRef sign)
959{
960   LLVMBuilderRef builder = bld->gallivm->builder;
961   const struct lp_type type = bld->type;
962   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
963   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
964   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
965   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
966                             ~((unsigned long long) 1 << (type.width - 1)));
967   LLVMValueRef val, res;
968
969   assert(type.floating);
970   assert(lp_check_value(type, a));
971
972   /* val = reinterpret_cast<int>(a) */
973   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
974   /* val = val & mask */
975   val = LLVMBuildAnd(builder, val, mask, "");
976   /* sign = sign << shift */
977   sign = LLVMBuildShl(builder, sign, shift, "");
978   /* res = val | sign */
979   res = LLVMBuildOr(builder, val, sign, "");
980   /* res = reinterpret_cast<float>(res) */
981   res = LLVMBuildBitCast(builder, res, vec_type, "");
982
983   return res;
984}
985
986
987/**
988 * Convert vector of (or scalar) int to vector of (or scalar) float.
989 */
990LLVMValueRef
991lp_build_int_to_float(struct lp_build_context *bld,
992                      LLVMValueRef a)
993{
994   LLVMBuilderRef builder = bld->gallivm->builder;
995   const struct lp_type type = bld->type;
996   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
997
998   assert(type.floating);
999
1000   return LLVMBuildSIToFP(builder, a, vec_type, "");
1001}
1002
1003
1004
/**
 * Rounding modes for SSE4.1's ROUND* instructions; the enumerator values
 * are passed verbatim as the immediate operand of the round intrinsics
 * (see lp_build_round_sse41).
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,   /* round to nearest, ties to even */
   LP_BUILD_ROUND_SSE41_FLOOR = 1,     /* round toward negative infinity */
   LP_BUILD_ROUND_SSE41_CEIL = 2,      /* round toward positive infinity */
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3   /* round toward zero */
};
1012
1013
/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      /* Scalar case: the ROUNDSS/ROUNDSD intrinsics operate on 4-wide
       * vectors, so insert the scalar into lane 0 of an undef vector and
       * extract lane 0 of the result afterwards. */
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      /* args = (pass-through upper lanes, value, rounding-mode immediate) */
      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      /* Vector case: ROUNDPS/ROUNDPD on the whole 128-bit vector. */
      assert(type.width*type.length == 128);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ps";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.pd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}
1089
1090
/**
 * Round float (vector) to nearest integer (vector) using SSE2's
 * CVTSS2SI/CVTPS2DQ conversions.  Only 32-bit floats, scalar or 4-wide,
 * are handled.
 */
static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      /* Scalar: cvtss2si takes a 4-wide vector operand, so widen first. */
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      /* Packed: convert all four lanes at once. */
      assert(type.width*type.length == 128);

      intrinsic = "llvm.x86.sse2.cvtps2dq";

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}
1138
1139
1140/**
1141 * Return the integer part of a float (vector) value (== round toward zero).
1142 * The returned value is a float (vector).
1143 * Ex: trunc(-1.5) = -1.0
1144 */
1145LLVMValueRef
1146lp_build_trunc(struct lp_build_context *bld,
1147               LLVMValueRef a)
1148{
1149   LLVMBuilderRef builder = bld->gallivm->builder;
1150   const struct lp_type type = bld->type;
1151
1152   assert(type.floating);
1153   assert(lp_check_value(type, a));
1154
1155   if (util_cpu_caps.has_sse4_1 &&
1156       (type.length == 1 || type.width*type.length == 128)) {
1157      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
1158   }
1159   else {
1160      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1161      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1162      LLVMValueRef res;
1163      res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1164      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1165      return res;
1166   }
1167}
1168
1169
1170/**
1171 * Return float (vector) rounded to nearest integer (vector).  The returned
1172 * value is a float (vector).
1173 * Ex: round(0.9) = 1.0
1174 * Ex: round(-1.5) = -2.0
1175 */
1176LLVMValueRef
1177lp_build_round(struct lp_build_context *bld,
1178               LLVMValueRef a)
1179{
1180   LLVMBuilderRef builder = bld->gallivm->builder;
1181   const struct lp_type type = bld->type;
1182
1183   assert(type.floating);
1184   assert(lp_check_value(type, a));
1185
1186   if (util_cpu_caps.has_sse4_1 &&
1187       (type.length == 1 || type.width*type.length == 128)) {
1188      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1189   }
1190   else {
1191      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1192      LLVMValueRef res;
1193      res = lp_build_iround(bld, a);
1194      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1195      return res;
1196   }
1197}
1198
1199
1200/**
1201 * Return floor of float (vector), result is a float (vector)
1202 * Ex: floor(1.1) = 1.0
1203 * Ex: floor(-1.1) = -2.0
1204 */
1205LLVMValueRef
1206lp_build_floor(struct lp_build_context *bld,
1207               LLVMValueRef a)
1208{
1209   LLVMBuilderRef builder = bld->gallivm->builder;
1210   const struct lp_type type = bld->type;
1211
1212   assert(type.floating);
1213   assert(lp_check_value(type, a));
1214
1215   if (util_cpu_caps.has_sse4_1 &&
1216       (type.length == 1 || type.width*type.length == 128)) {
1217      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1218   }
1219   else {
1220      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1221      LLVMValueRef res;
1222      res = lp_build_ifloor(bld, a);
1223      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1224      return res;
1225   }
1226}
1227
1228
1229/**
1230 * Return ceiling of float (vector), returning float (vector).
1231 * Ex: ceil( 1.1) = 2.0
1232 * Ex: ceil(-1.1) = -1.0
1233 */
1234LLVMValueRef
1235lp_build_ceil(struct lp_build_context *bld,
1236              LLVMValueRef a)
1237{
1238   LLVMBuilderRef builder = bld->gallivm->builder;
1239   const struct lp_type type = bld->type;
1240
1241   assert(type.floating);
1242   assert(lp_check_value(type, a));
1243
1244   if (util_cpu_caps.has_sse4_1 &&
1245       (type.length == 1 || type.width*type.length == 128)) {
1246      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1247   }
1248   else {
1249      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1250      LLVMValueRef res;
1251      res = lp_build_iceil(bld, a);
1252      res = LLVMBuildSIToFP(builder, res, vec_type, "");
1253      return res;
1254   }
1255}
1256
1257
1258/**
1259 * Return fractional part of 'a' computed as a - floor(a)
1260 * Typically used in texture coord arithmetic.
1261 */
1262LLVMValueRef
1263lp_build_fract(struct lp_build_context *bld,
1264               LLVMValueRef a)
1265{
1266   assert(bld->type.floating);
1267   return lp_build_sub(bld, a, lp_build_floor(bld, a));
1268}
1269
1270
1271/**
1272 * Return the integer part of a float (vector) value (== round toward zero).
1273 * The returned value is an integer (vector).
1274 * Ex: itrunc(-1.5) = -1
1275 */
1276LLVMValueRef
1277lp_build_itrunc(struct lp_build_context *bld,
1278                LLVMValueRef a)
1279{
1280   LLVMBuilderRef builder = bld->gallivm->builder;
1281   const struct lp_type type = bld->type;
1282   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1283
1284   assert(type.floating);
1285   assert(lp_check_value(type, a));
1286
1287   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1288}
1289
1290
1291/**
1292 * Return float (vector) rounded to nearest integer (vector).  The returned
1293 * value is an integer (vector).
1294 * Ex: iround(0.9) = 1
1295 * Ex: iround(-1.5) = -2
1296 */
1297LLVMValueRef
1298lp_build_iround(struct lp_build_context *bld,
1299                LLVMValueRef a)
1300{
1301   LLVMBuilderRef builder = bld->gallivm->builder;
1302   const struct lp_type type = bld->type;
1303   LLVMTypeRef int_vec_type = bld->int_vec_type;
1304   LLVMValueRef res;
1305
1306   assert(type.floating);
1307
1308   assert(lp_check_value(type, a));
1309
1310   if (util_cpu_caps.has_sse2 &&
1311       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
1312      return lp_build_iround_nearest_sse2(bld, a);
1313   }
1314   else if (util_cpu_caps.has_sse4_1 &&
1315       (type.length == 1 || type.width*type.length == 128)) {
1316      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1317   }
1318   else {
1319      LLVMValueRef half;
1320
1321      half = lp_build_const_vec(bld->gallivm, type, 0.5);
1322
1323      if (type.sign) {
1324         LLVMTypeRef vec_type = bld->vec_type;
1325         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1326                                    (unsigned long long)1 << (type.width - 1));
1327         LLVMValueRef sign;
1328
1329         /* get sign bit */
1330         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1331         sign = LLVMBuildAnd(builder, sign, mask, "");
1332
1333         /* sign * 0.5 */
1334         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1335         half = LLVMBuildOr(builder, sign, half, "");
1336         half = LLVMBuildBitCast(builder, half, vec_type, "");
1337      }
1338
1339      res = LLVMBuildFAdd(builder, a, half, "");
1340   }
1341
1342   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1343
1344   return res;
1345}
1346
1347
1348/**
1349 * Return floor of float (vector), result is an int (vector)
1350 * Ex: ifloor(1.1) = 1.0
1351 * Ex: ifloor(-1.1) = -2.0
1352 */
1353LLVMValueRef
1354lp_build_ifloor(struct lp_build_context *bld,
1355                LLVMValueRef a)
1356{
1357   LLVMBuilderRef builder = bld->gallivm->builder;
1358   const struct lp_type type = bld->type;
1359   LLVMTypeRef int_vec_type = bld->int_vec_type;
1360   LLVMValueRef res;
1361
1362   assert(type.floating);
1363   assert(lp_check_value(type, a));
1364
1365   if (util_cpu_caps.has_sse4_1 &&
1366       (type.length == 1 || type.width*type.length == 128)) {
1367      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1368   }
1369   else {
1370      res = a;
1371
1372      if (type.sign) {
1373         /* Take the sign bit and add it to 1 constant */
1374         LLVMTypeRef vec_type = bld->vec_type;
1375         unsigned mantissa = lp_mantissa(type);
1376         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1377                                  (unsigned long long)1 << (type.width - 1));
1378         LLVMValueRef sign;
1379         LLVMValueRef offset;
1380
1381         /* sign = a < 0 ? ~0 : 0 */
1382         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1383         sign = LLVMBuildAnd(builder, sign, mask, "");
1384         sign = LLVMBuildAShr(builder, sign,
1385                              lp_build_const_int_vec(bld->gallivm, type,
1386                                                     type.width - 1),
1387                              "ifloor.sign");
1388
1389         /* offset = -0.99999(9)f */
1390         offset = lp_build_const_vec(bld->gallivm, type,
1391                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1392         offset = LLVMConstBitCast(offset, int_vec_type);
1393
1394         /* offset = a < 0 ? offset : 0.0f */
1395         offset = LLVMBuildAnd(builder, offset, sign, "");
1396         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");
1397
1398         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
1399      }
1400   }
1401
1402   /* round to nearest (toward zero) */
1403   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
1404
1405   return res;
1406}
1407
1408
1409/**
1410 * Return ceiling of float (vector), returning int (vector).
1411 * Ex: iceil( 1.1) = 2
1412 * Ex: iceil(-1.1) = -1
1413 */
1414LLVMValueRef
1415lp_build_iceil(struct lp_build_context *bld,
1416               LLVMValueRef a)
1417{
1418   LLVMBuilderRef builder = bld->gallivm->builder;
1419   const struct lp_type type = bld->type;
1420   LLVMTypeRef int_vec_type = bld->int_vec_type;
1421   LLVMValueRef res;
1422
1423   assert(type.floating);
1424   assert(lp_check_value(type, a));
1425
1426   if (util_cpu_caps.has_sse4_1 &&
1427       (type.length == 1 || type.width*type.length == 128)) {
1428      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1429   }
1430   else {
1431      LLVMTypeRef vec_type = bld->vec_type;
1432      unsigned mantissa = lp_mantissa(type);
1433      LLVMValueRef offset;
1434
1435      /* offset = 0.99999(9)f */
1436      offset = lp_build_const_vec(bld->gallivm, type,
1437                                  (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1438
1439      if (type.sign) {
1440         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1441                                (unsigned long long)1 << (type.width - 1));
1442         LLVMValueRef sign;
1443
1444         /* sign = a < 0 ? 0 : ~0 */
1445         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1446         sign = LLVMBuildAnd(builder, sign, mask, "");
1447         sign = LLVMBuildAShr(builder, sign,
1448                              lp_build_const_int_vec(bld->gallivm, type,
1449                                                     type.width - 1),
1450                              "iceil.sign");
1451         sign = LLVMBuildNot(builder, sign, "iceil.not");
1452
1453         /* offset = a < 0 ? 0.0 : offset */
1454         offset = LLVMConstBitCast(offset, int_vec_type);
1455         offset = LLVMBuildAnd(builder, offset, sign, "");
1456         offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
1457      }
1458
1459      res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
1460   }
1461
1462   /* round to nearest (toward zero) */
1463   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
1464
1465   return res;
1466}
1467
1468
1469/**
1470 * Combined ifloor() & fract().
1471 *
1472 * Preferred to calling the functions separately, as it will ensure that the
1473 * stratergy (floor() vs ifloor()) that results in less redundant work is used.
1474 */
1475void
1476lp_build_ifloor_fract(struct lp_build_context *bld,
1477                      LLVMValueRef a,
1478                      LLVMValueRef *out_ipart,
1479                      LLVMValueRef *out_fpart)
1480{
1481   LLVMBuilderRef builder = bld->gallivm->builder;
1482   const struct lp_type type = bld->type;
1483   LLVMValueRef ipart;
1484
1485   assert(type.floating);
1486   assert(lp_check_value(type, a));
1487
1488   if (util_cpu_caps.has_sse4_1 &&
1489       (type.length == 1 || type.width*type.length == 128)) {
1490      /*
1491       * floor() is easier.
1492       */
1493
1494      ipart = lp_build_floor(bld, a);
1495      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1496      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
1497   }
1498   else {
1499      /*
1500       * ifloor() is easier.
1501       */
1502
1503      *out_ipart = lp_build_ifloor(bld, a);
1504      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
1505      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1506   }
1507}
1508
1509
1510LLVMValueRef
1511lp_build_sqrt(struct lp_build_context *bld,
1512              LLVMValueRef a)
1513{
1514   LLVMBuilderRef builder = bld->gallivm->builder;
1515   const struct lp_type type = bld->type;
1516   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1517   char intrinsic[32];
1518
1519   assert(lp_check_value(type, a));
1520
1521   /* TODO: optimize the constant case */
1522   /* TODO: optimize the constant case */
1523
1524   assert(type.floating);
1525   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1526
1527   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1528}
1529
1530
1531/**
1532 * Do one Newton-Raphson step to improve reciprocate precision:
1533 *
1534 *   x_{i+1} = x_i * (2 - a * x_i)
1535 *
1536 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1537 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
1538 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
1539 * halo. It would be necessary to clamp the argument to prevent this.
1540 *
1541 * See also:
1542 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1543 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1544 */
1545static INLINE LLVMValueRef
1546lp_build_rcp_refine(struct lp_build_context *bld,
1547                    LLVMValueRef a,
1548                    LLVMValueRef rcp_a)
1549{
1550   LLVMBuilderRef builder = bld->gallivm->builder;
1551   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
1552   LLVMValueRef res;
1553
1554   res = LLVMBuildFMul(builder, a, rcp_a, "");
1555   res = LLVMBuildFSub(builder, two, res, "");
1556   res = LLVMBuildFMul(builder, rcp_a, res, "");
1557
1558   return res;
1559}
1560
1561
/**
 * Generate 1/a.
 */
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   /* trivial/constant-folding cases first */
   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, a case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   /* NOTE: deliberately disabled via FALSE -- see the rationale above */
   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   /* plain division -- accurate and usually fast enough */
   return LLVMBuildFDiv(builder, bld->one, a, "");
}
1612
1613
1614/**
1615 * Do one Newton-Raphson step to improve rsqrt precision:
1616 *
1617 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1618 *
1619 * See also:
1620 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1621 */
1622static INLINE LLVMValueRef
1623lp_build_rsqrt_refine(struct lp_build_context *bld,
1624                      LLVMValueRef a,
1625                      LLVMValueRef rsqrt_a)
1626{
1627   LLVMBuilderRef builder = bld->gallivm->builder;
1628   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
1629   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
1630   LLVMValueRef res;
1631
1632   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
1633   res = LLVMBuildFMul(builder, a, res, "");
1634   res = LLVMBuildFSub(builder, three, res, "");
1635   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
1636   res = LLVMBuildFMul(builder, half, res, "");
1637
1638   return res;
1639}
1640
1641
1642/**
1643 * Generate 1/sqrt(a)
1644 */
1645LLVMValueRef
1646lp_build_rsqrt(struct lp_build_context *bld,
1647               LLVMValueRef a)
1648{
1649   LLVMBuilderRef builder = bld->gallivm->builder;
1650   const struct lp_type type = bld->type;
1651
1652   assert(lp_check_value(type, a));
1653
1654   assert(type.floating);
1655
1656   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1657      const unsigned num_iterations = 1;
1658      LLVMValueRef res;
1659      unsigned i;
1660
1661      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1662
1663      for (i = 0; i < num_iterations; ++i) {
1664         res = lp_build_rsqrt_refine(bld, a, res);
1665      }
1666
1667      return res;
1668   }
1669
1670   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1671}
1672
1673
1674/**
1675 * Generate sin(a) using SSE2
1676 */
1677LLVMValueRef
1678lp_build_sin(struct lp_build_context *bld,
1679             LLVMValueRef a)
1680{
1681   struct gallivm_state *gallivm = bld->gallivm;
1682   LLVMBuilderRef builder = gallivm->builder;
1683   struct lp_type int_type = lp_int_type(bld->type);
1684   LLVMBuilderRef b = builder;
1685
1686   /*
1687    *  take the absolute value,
1688    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1689    */
1690
1691   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
1692   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
1693
1694   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1695   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
1696
1697   /*
1698    * extract the sign bit (upper one)
1699    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1700    */
1701   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
1702   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1703
1704   /*
1705    * scale by 4/Pi
1706    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1707    */
1708
1709   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
1710   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1711
1712   /*
1713    * store the integer part of y in mm0
1714    * emm2 = _mm_cvttps_epi32(y);
1715    */
1716
1717   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
1718
1719   /*
1720    * j=(j+1) & (~1) (see the cephes sources)
1721    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1722    */
1723
1724   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
1725   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1726   /*
1727    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1728    */
1729   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
1730   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1731
1732   /*
1733    * y = _mm_cvtepi32_ps(emm2);
1734    */
1735   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
1736
1737   /* get the swap sign flag
1738    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1739    */
1740   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
1741   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1742
1743   /*
1744    * emm2 = _mm_slli_epi32(emm0, 29);
1745    */
1746   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
1747   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1748
1749   /*
1750    * get the polynom selection mask
1751    * there is one polynom for 0 <= x <= Pi/4
1752    * and another one for Pi/4<x<=Pi/2
1753    * Both branches will be computed.
1754    *
1755    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1756    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1757    */
1758
1759   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1760   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1761   LLVMValueRef poly_mask = lp_build_compare(gallivm,
1762                                             int_type, PIPE_FUNC_EQUAL,
1763                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
1764   /*
1765    *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1766    */
1767   LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1768
1769   /*
1770    * _PS_CONST(minus_cephes_DP1, -0.78515625);
1771    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1772    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1773    */
1774   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
1775   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
1776   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
1777
1778   /*
1779    * The magic pass: "Extended precision modular arithmetic"
1780    * x = ((x - y * DP1) - y * DP2) - y * DP3;
1781    * xmm1 = _mm_mul_ps(y, xmm1);
1782    * xmm2 = _mm_mul_ps(y, xmm2);
1783    * xmm3 = _mm_mul_ps(y, xmm3);
1784    */
1785   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1786   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1787   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1788
1789   /*
1790    * x = _mm_add_ps(x, xmm1);
1791    * x = _mm_add_ps(x, xmm2);
1792    * x = _mm_add_ps(x, xmm3);
1793    */
1794
1795   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1796   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1797   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1798
1799   /*
1800    * Evaluate the first polynom  (0 <= x <= Pi/4)
1801    *
1802    * z = _mm_mul_ps(x,x);
1803    */
1804   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1805
1806   /*
1807    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
1808    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1809    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
1810    */
1811   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
1812   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
1813   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
1814
1815   /*
1816    * y = *(v4sf*)_ps_coscof_p0;
1817    * y = _mm_mul_ps(y, z);
1818    */
1819   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1820   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1821   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1822   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1823   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1824   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1825
1826
1827   /*
1828    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1829    * y = _mm_sub_ps(y, tmp);
1830    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1831    */
1832   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
1833   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1834   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1835   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
1836   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1837
1838   /*
1839    * _PS_CONST(sincof_p0, -1.9515295891E-4);
1840    * _PS_CONST(sincof_p1,  8.3321608736E-3);
1841    * _PS_CONST(sincof_p2, -1.6666654611E-1);
1842    */
1843   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
1844   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
1845   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
1846
1847   /*
1848    * Evaluate the second polynom  (Pi/4 <= x <= 0)
1849    *
1850    * y2 = *(v4sf*)_ps_sincof_p0;
1851    * y2 = _mm_mul_ps(y2, z);
1852    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1853    * y2 = _mm_mul_ps(y2, z);
1854    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1855    * y2 = _mm_mul_ps(y2, z);
1856    * y2 = _mm_mul_ps(y2, x);
1857    * y2 = _mm_add_ps(y2, x);
1858    */
1859
1860   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1861   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1862   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1863   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1864   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1865   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1866   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1867
1868   /*
1869    * select the correct result from the two polynoms
1870    * xmm3 = poly_mask;
1871    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1872    * y = _mm_andnot_ps(xmm3, y);
1873    * y = _mm_add_ps(y,y2);
1874    */
1875   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
1876   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
1877   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1878   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
1879   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1880   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1881   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1882
1883   /*
1884    * update the sign
1885    * y = _mm_xor_ps(y, sign_bit);
1886    */
1887   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1888   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
1889   return y_result;
1890}
1891
1892
1893/**
1894 * Generate cos(a) using SSE2
1895 */
1896LLVMValueRef
1897lp_build_cos(struct lp_build_context *bld,
1898             LLVMValueRef a)
1899{
1900   struct gallivm_state *gallivm = bld->gallivm;
1901   LLVMBuilderRef builder = gallivm->builder;
1902   struct lp_type int_type = lp_int_type(bld->type);
1903   LLVMBuilderRef b = builder;
1904
1905   /*
1906    *  take the absolute value,
1907    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1908    */
1909
1910   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
1911   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
1912
1913   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1914   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
1915
1916   /*
1917    * scale by 4/Pi
1918    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1919    */
1920
1921   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
1922   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1923
1924   /*
1925    * store the integer part of y in mm0
1926    * emm2 = _mm_cvttps_epi32(y);
1927    */
1928
1929   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
1930
1931   /*
1932    * j=(j+1) & (~1) (see the cephes sources)
1933    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1934    */
1935
1936   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
1937   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1938   /*
1939    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1940    */
1941   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
1942   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1943
1944   /*
1945    * y = _mm_cvtepi32_ps(emm2);
1946    */
1947   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
1948
1949
1950   /*
1951    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1952    */
1953   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1954   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1955
1956
1957   /* get the swap sign flag
1958    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1959    */
1960   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
1961   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1962   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
1963   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1964
1965   /*
1966    * emm2 = _mm_slli_epi32(emm0, 29);
1967    */
1968   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
1969   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1970
1971   /*
1972    * get the polynom selection mask
1973    * there is one polynom for 0 <= x <= Pi/4
1974    * and another one for Pi/4<x<=Pi/2
1975    * Both branches will be computed.
1976    *
1977    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1978    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1979    */
1980
1981   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1982   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1983   LLVMValueRef poly_mask = lp_build_compare(gallivm,
1984                                             int_type, PIPE_FUNC_EQUAL,
1985   				             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
1986
1987   /*
1988    * _PS_CONST(minus_cephes_DP1, -0.78515625);
1989    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1990    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1991    */
1992   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
1993   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
1994   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
1995
1996   /*
1997    * The magic pass: "Extended precision modular arithmetic"
1998    * x = ((x - y * DP1) - y * DP2) - y * DP3;
1999    * xmm1 = _mm_mul_ps(y, xmm1);
2000    * xmm2 = _mm_mul_ps(y, xmm2);
2001    * xmm3 = _mm_mul_ps(y, xmm3);
2002    */
2003   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2004   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2005   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2006
2007   /*
2008    * x = _mm_add_ps(x, xmm1);
2009    * x = _mm_add_ps(x, xmm2);
2010    * x = _mm_add_ps(x, xmm3);
2011    */
2012
2013   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2014   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2015   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2016
2017   /*
2018    * Evaluate the first polynom  (0 <= x <= Pi/4)
2019    *
2020    * z = _mm_mul_ps(x,x);
2021    */
2022   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2023
2024   /*
2025    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2026    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2027    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2028    */
2029   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2030   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2031   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2032
2033   /*
2034    * y = *(v4sf*)_ps_coscof_p0;
2035    * y = _mm_mul_ps(y, z);
2036    */
2037   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2038   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2039   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2040   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2041   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2042   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2043
2044
2045   /*
2046    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2047    * y = _mm_sub_ps(y, tmp);
2048    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2049    */
2050   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2051   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2052   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2053   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2054   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2055
2056   /*
2057    * _PS_CONST(sincof_p0, -1.9515295891E-4);
2058    * _PS_CONST(sincof_p1,  8.3321608736E-3);
2059    * _PS_CONST(sincof_p2, -1.6666654611E-1);
2060    */
2061   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2062   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2063   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2064
2065   /*
2066    * Evaluate the second polynom  (Pi/4 <= x <= 0)
2067    *
2068    * y2 = *(v4sf*)_ps_sincof_p0;
2069    * y2 = _mm_mul_ps(y2, z);
2070    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2071    * y2 = _mm_mul_ps(y2, z);
2072    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2073    * y2 = _mm_mul_ps(y2, z);
2074    * y2 = _mm_mul_ps(y2, x);
2075    * y2 = _mm_add_ps(y2, x);
2076    */
2077
2078   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2079   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2080   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2081   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2082   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2083   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2084   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2085
2086   /*
2087    * select the correct result from the two polynoms
2088    * xmm3 = poly_mask;
2089    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2090    * y = _mm_andnot_ps(xmm3, y);
2091    * y = _mm_add_ps(y,y2);
2092    */
2093   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2094   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2095   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2096   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2097   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2098   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2099
2100   /*
2101    * update the sign
2102    * y = _mm_xor_ps(y, sign_bit);
2103    */
2104   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2105   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2106   return y_result;
2107}
2108
2109
2110/**
2111 * Generate pow(x, y)
2112 */
2113LLVMValueRef
2114lp_build_pow(struct lp_build_context *bld,
2115             LLVMValueRef x,
2116             LLVMValueRef y)
2117{
2118   /* TODO: optimize the constant case */
2119   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2120       LLVMIsConstant(x) && LLVMIsConstant(y)) {
2121      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2122                   __FUNCTION__);
2123   }
2124
2125   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2126}
2127
2128
2129/**
2130 * Generate exp(x)
2131 */
2132LLVMValueRef
2133lp_build_exp(struct lp_build_context *bld,
2134             LLVMValueRef x)
2135{
2136   /* log2(e) = 1/log(2) */
2137   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2138                                           1.4426950408889634);
2139
2140   assert(lp_check_value(bld->type, x));
2141
2142   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2143}
2144
2145
2146/**
2147 * Generate log(x)
2148 */
2149LLVMValueRef
2150lp_build_log(struct lp_build_context *bld,
2151             LLVMValueRef x)
2152{
2153   /* log(2) */
2154   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2155                                          0.69314718055994529);
2156
2157   assert(lp_check_value(bld->type, x));
2158
2159   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2160}
2161
2162
2163/**
2164 * Generate polynomial.
2165 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2166 */
2167static LLVMValueRef
2168lp_build_polynomial(struct lp_build_context *bld,
2169                    LLVMValueRef x,
2170                    const double *coeffs,
2171                    unsigned num_coeffs)
2172{
2173   const struct lp_type type = bld->type;
2174   LLVMValueRef even = NULL, odd = NULL;
2175   LLVMValueRef x2;
2176   unsigned i;
2177
2178   assert(lp_check_value(bld->type, x));
2179
2180   /* TODO: optimize the constant case */
2181   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2182       LLVMIsConstant(x)) {
2183      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2184                   __FUNCTION__);
2185   }
2186
2187   /*
2188    * Calculate odd and even terms seperately to decrease data dependency
2189    * Ex:
2190    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
2191    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2192    */
2193   x2 = lp_build_mul(bld, x, x);
2194
2195   for (i = num_coeffs; i--; ) {
2196      LLVMValueRef coeff;
2197
2198      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2199
2200      if (i % 2 == 0) {
2201         if (even)
2202            even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2203         else
2204            even = coeff;
2205      } else {
2206         if (odd)
2207            odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2208         else
2209            odd = coeff;
2210      }
2211   }
2212
2213   if (odd)
2214      return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2215   else if (even)
2216      return even;
2217   else
2218      return bld->undef;
2219}
2220
2221
2222/**
2223 * Minimax polynomial fit of 2**x, in range [0, 1[
2224 */
2225const double lp_build_exp2_polynomial[] = {
2226#if EXP_POLY_DEGREE == 5
2227   0.999999925063526176901,
2228   0.693153073200168932794,
2229   0.240153617044375388211,
2230   0.0558263180532956664775,
2231   0.00898934009049466391101,
2232   0.00187757667519147912699
2233#elif EXP_POLY_DEGREE == 4
2234   1.00000259337069434683,
2235   0.693003834469974940458,
2236   0.24144275689150793076,
2237   0.0520114606103070150235,
2238   0.0135341679161270268764
2239#elif EXP_POLY_DEGREE == 3
2240   0.999925218562710312959,
2241   0.695833540494823811697,
2242   0.226067155427249155588,
2243   0.0780245226406372992967
2244#elif EXP_POLY_DEGREE == 2
2245   1.00172476321474503578,
2246   0.657636275736077639316,
2247   0.33718943461968720704
2248#else
2249#error
2250#endif
2251};
2252
2253
/**
 * Generate exp2(x) = 2^x, and optionally its building blocks.
 *
 * x is split into ipart = floor(x) and fpart = x - ipart; 2^ipart is
 * constructed by writing (ipart + 127) directly into the IEEE-754
 * single-precision exponent field, and 2^fpart is approximated by a
 * minimax polynomial.  Only 32-bit floats are supported (see the
 * assert below and the hard-coded 127 bias / 23-bit mantissa shift).
 *
 * @param p_exp2_int_part  if non-NULL, receives 2^ipart as a float vector
 * @param p_frac_part      if non-NULL, receives fpart
 * @param p_exp2           if non-NULL, receives 2^x
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* Emit nothing if the caller requested no outputs. */
   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /* Clamp x so the biased exponent (ipart + 127) stays within the
       * 8-bit exponent field when shifted into place below. */
      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      /* fpart = x - ipart */
      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) */
      /* Build 2^ipart by placing the biased exponent into bits 30:23
       * of an IEEE-754 float (mantissa bits stay zero). */
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* 2^fpart via polynomial approximation, then 2^x = 2^ipart * 2^fpart */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
2315
2316
2317LLVMValueRef
2318lp_build_exp2(struct lp_build_context *bld,
2319              LLVMValueRef x)
2320{
2321   LLVMValueRef res;
2322   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2323   return res;
2324}
2325
2326
2327/**
2328 * Extract the exponent of a IEEE-754 floating point value.
2329 *
2330 * Optionally apply an integer bias.
2331 *
2332 * Result is an integer value with
2333 *
2334 *   ifloor(log2(x)) + bias
2335 */
2336LLVMValueRef
2337lp_build_extract_exponent(struct lp_build_context *bld,
2338                          LLVMValueRef x,
2339                          int bias)
2340{
2341   LLVMBuilderRef builder = bld->gallivm->builder;
2342   const struct lp_type type = bld->type;
2343   unsigned mantissa = lp_mantissa(type);
2344   LLVMValueRef res;
2345
2346   assert(type.floating);
2347
2348   assert(lp_check_value(bld->type, x));
2349
2350   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2351
2352   res = LLVMBuildLShr(builder, x,
2353                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2354   res = LLVMBuildAnd(builder, res,
2355                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
2356   res = LLVMBuildSub(builder, res,
2357                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2358
2359   return res;
2360}
2361
2362
2363/**
2364 * Extract the mantissa of the a floating.
2365 *
2366 * Result is a floating point value with
2367 *
2368 *   x / floor(log2(x))
2369 */
2370LLVMValueRef
2371lp_build_extract_mantissa(struct lp_build_context *bld,
2372                          LLVMValueRef x)
2373{
2374   LLVMBuilderRef builder = bld->gallivm->builder;
2375   const struct lp_type type = bld->type;
2376   unsigned mantissa = lp_mantissa(type);
2377   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2378                                                  (1ULL << mantissa) - 1);
2379   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2380   LLVMValueRef res;
2381
2382   assert(lp_check_value(bld->type, x));
2383
2384   assert(type.floating);
2385
2386   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2387
2388   /* res = x / 2**ipart */
2389   res = LLVMBuildAnd(builder, x, mantmask, "");
2390   res = LLVMBuildOr(builder, res, one, "");
2391   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2392
2393   return res;
2394}
2395
2396
2397
2398/**
2399 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
2400 * These coefficients can be generate with
2401 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2402 */
2403const double lp_build_log2_polynomial[] = {
2404#if LOG_POLY_DEGREE == 5
2405   2.88539008148777786488L,
2406   0.961796878841293367824L,
2407   0.577058946784739859012L,
2408   0.412914355135828735411L,
2409   0.308591899232910175289L,
2410   0.352376952300281371868L,
2411#elif LOG_POLY_DEGREE == 4
2412   2.88539009343309178325L,
2413   0.961791550404184197881L,
2414   0.577440339438736392009L,
2415   0.403343858251329912514L,
2416   0.406718052498846252698L,
2417#elif LOG_POLY_DEGREE == 3
2418   2.88538959748872753838L,
2419   0.961932915889597772928L,
2420   0.571118517972136195241L,
2421   0.493997535084709500285L,
2422#else
2423#error
2424#endif
2425};
2426
2427/**
2428 * See http://www.devmaster.net/forums/showthread.php?p=43580
2429 * http://en.wikipedia.org/wiki/Logarithm#Calculation
2430 * http://www.nezumi.demon.co.uk/consult/logx.htm
2431 */
2432void
2433lp_build_log2_approx(struct lp_build_context *bld,
2434                     LLVMValueRef x,
2435                     LLVMValueRef *p_exp,
2436                     LLVMValueRef *p_floor_log2,
2437                     LLVMValueRef *p_log2)
2438{
2439   LLVMBuilderRef builder = bld->gallivm->builder;
2440   const struct lp_type type = bld->type;
2441   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2442   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2443
2444   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2445   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2446   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2447
2448   LLVMValueRef i = NULL;
2449   LLVMValueRef y = NULL;
2450   LLVMValueRef z = NULL;
2451   LLVMValueRef exp = NULL;
2452   LLVMValueRef mant = NULL;
2453   LLVMValueRef logexp = NULL;
2454   LLVMValueRef logmant = NULL;
2455   LLVMValueRef res = NULL;
2456
2457   assert(lp_check_value(bld->type, x));
2458
2459   if(p_exp || p_floor_log2 || p_log2) {
2460      /* TODO: optimize the constant case */
2461      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2462          LLVMIsConstant(x)) {
2463         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2464                      __FUNCTION__);
2465      }
2466
2467      assert(type.floating && type.width == 32);
2468
2469      /*
2470       * We don't explicitly handle denormalized numbers. They will yield a
2471       * result in the neighbourhood of -127, which appears to be adequate
2472       * enough.
2473       */
2474
2475      i = LLVMBuildBitCast(builder, x, int_vec_type, "");
2476
2477      /* exp = (float) exponent(x) */
2478      exp = LLVMBuildAnd(builder, i, expmask, "");
2479   }
2480
2481   if(p_floor_log2 || p_log2) {
2482      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
2483      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
2484      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
2485   }
2486
2487   if(p_log2) {
2488      /* mant = 1 + (float) mantissa(x) */
2489      mant = LLVMBuildAnd(builder, i, mantmask, "");
2490      mant = LLVMBuildOr(builder, mant, one, "");
2491      mant = LLVMBuildBitCast(builder, mant, vec_type, "");
2492
2493      /* y = (mant - 1) / (mant + 1) */
2494      y = lp_build_div(bld,
2495         lp_build_sub(bld, mant, bld->one),
2496         lp_build_add(bld, mant, bld->one)
2497      );
2498
2499      /* z = y^2 */
2500      z = lp_build_mul(bld, y, y);
2501
2502      /* compute P(z) */
2503      logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
2504                                    Elements(lp_build_log2_polynomial));
2505
2506      /* logmant = y * P(z) */
2507      logmant = lp_build_mul(bld, y, logmant);
2508
2509      res = lp_build_add(bld, logmant, logexp);
2510   }
2511
2512   if(p_exp) {
2513      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
2514      *p_exp = exp;
2515   }
2516
2517   if(p_floor_log2)
2518      *p_floor_log2 = logexp;
2519
2520   if(p_log2)
2521      *p_log2 = res;
2522}
2523
2524
2525LLVMValueRef
2526lp_build_log2(struct lp_build_context *bld,
2527              LLVMValueRef x)
2528{
2529   LLVMValueRef res;
2530   lp_build_log2_approx(bld, x, NULL, NULL, &res);
2531   return res;
2532}
2533
2534
2535/**
2536 * Faster (and less accurate) log2.
2537 *
2538 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
2539 *
2540 * Piece-wise linear approximation, with exact results when x is a
2541 * power of two.
2542 *
2543 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2544 */
2545LLVMValueRef
2546lp_build_fast_log2(struct lp_build_context *bld,
2547                   LLVMValueRef x)
2548{
2549   LLVMBuilderRef builder = bld->gallivm->builder;
2550   LLVMValueRef ipart;
2551   LLVMValueRef fpart;
2552
2553   assert(lp_check_value(bld->type, x));
2554
2555   assert(bld->type.floating);
2556
2557   /* ipart = floor(log2(x)) - 1 */
2558   ipart = lp_build_extract_exponent(bld, x, -1);
2559   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
2560
2561   /* fpart = x / 2**ipart */
2562   fpart = lp_build_extract_mantissa(bld, x);
2563
2564   /* ipart + fpart */
2565   return LLVMBuildFAdd(builder, ipart, fpart, "");
2566}
2567
2568
2569/**
2570 * Fast implementation of iround(log2(x)).
2571 *
2572 * Not an approximation -- it should give accurate results all the time.
2573 */
2574LLVMValueRef
2575lp_build_ilog2(struct lp_build_context *bld,
2576               LLVMValueRef x)
2577{
2578   LLVMBuilderRef builder = bld->gallivm->builder;
2579   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
2580   LLVMValueRef ipart;
2581
2582   assert(bld->type.floating);
2583
2584   assert(lp_check_value(bld->type, x));
2585
2586   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
2587   x = LLVMBuildFMul(builder, x, sqrt2, "");
2588
2589   /* ipart = floor(log2(x) + 0.5)  */
2590   ipart = lp_build_extract_exponent(bld, x, 0);
2591
2592   return ipart;
2593}
2594
2595LLVMValueRef
2596lp_build_mod(struct lp_build_context *bld,
2597             LLVMValueRef x,
2598             LLVMValueRef y)
2599{
2600   LLVMBuilderRef builder = bld->gallivm->builder;
2601   LLVMValueRef res;
2602   const struct lp_type type = bld->type;
2603
2604   assert(lp_check_value(type, x));
2605   assert(lp_check_value(type, y));
2606
2607   if (type.floating)
2608      res = LLVMBuildFRem(builder, x, y, "");
2609   else if (type.sign)
2610      res = LLVMBuildSRem(builder, x, y, "");
2611   else
2612      res = LLVMBuildURem(builder, x, y, "");
2613   return res;
2614}
2615