1
2/*---------------------------------------------------------------*/
3/*--- begin                              host_generic_maddf.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   Compute x * y + z as ternary operation.
8   Copyright (C) 2010-2013 Free Software Foundation, Inc.
9   This file is part of the GNU C Library.
10   Contributed by Jakub Jelinek <jakub@redhat.com>, 2010.
11
12   The GNU C Library is free software; you can redistribute it and/or
13   modify it under the terms of the GNU Lesser General Public
14   License as published by the Free Software Foundation; either
15   version 2.1 of the License, or (at your option) any later version.
16
17   The GNU C Library is distributed in the hope that it will be useful,
18   but WITHOUT ANY WARRANTY; without even the implied warranty of
19   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20   Lesser General Public License for more details.
21
22   You should have received a copy of the GNU Lesser General Public
23   License along with the GNU C Library; if not, see
24   <http://www.gnu.org/licenses/>.
25*/
26
27/* Generic helper functions for doing FMA, i.e. compute x * y + z
28   as ternary operation.
29   These are purely back-end entities and cannot be seen/referenced
30   from IR. */
31
32#include "libvex_basictypes.h"
33#include "host_generic_maddf.h"
34#include "main_util.h"
35
36/* This implementation relies on Double being more than twice as
37   precise as Float and uses rounding to odd in order to avoid problems
38   with double rounding.
39   See a paper by Boldo and Melquiond:
40   http://www.lri.fr/~melquion/doc/08-tc.pdf  */
41
42#define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X))
43
44#if defined(__x86_64__) && defined(__SSE2_MATH__)
45# define ENV_TYPE unsigned int
46/* Save current rounding mode into ENV, hold exceptions, set rounding
47   mode to rounding toward zero.  */
48# define ROUNDTOZERO(env) \
49   do {							\
50      unsigned int mxcsr;				\
51      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
52      (env) = mxcsr;					\
53      mxcsr = (mxcsr | 0x7f80) & ~0x3f;			\
54      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
55   } while (0)
56/* Restore exceptions from ENV, return if inexact exception has been raised
57   since ROUNDTOZERO.  */
58# define RESET_TESTINEXACT(env) \
59   ({							\
60      unsigned int mxcsr, ret;				\
61      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
62      ret = (mxcsr >> 5) & 1;				\
63      mxcsr = (mxcsr & 0x3d) | (env);			\
64      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
65      ret;						\
66   })
67/* Return if inexact exception has been raised since ROUNDTOZERO.  */
68# define TESTINEXACT() \
69   ({							\
70      unsigned int mxcsr;				\
71      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
72      (mxcsr >> 5) & 1;					\
73   })
74#endif
75
76#define DBL_MANT_DIG 53
77#define IEEE754_DOUBLE_BIAS 0x3ff
78
79union vg_ieee754_double {
80   Double d;
81
82   /* This is the IEEE 754 double-precision format.  */
83   struct {
84#ifdef VKI_BIG_ENDIAN
85      unsigned int negative:1;
86      unsigned int exponent:11;
87      unsigned int mantissa0:20;
88      unsigned int mantissa1:32;
89#else
90      unsigned int mantissa1:32;
91      unsigned int mantissa0:20;
92      unsigned int exponent:11;
93      unsigned int negative:1;
94#endif
95   } ieee;
96};
97
98void VEX_REGPARM(3)
99     h_generic_calc_MAddF32 ( /*OUT*/Float* res,
100                               Float* argX, Float* argY, Float* argZ )
101{
102#ifndef ENV_TYPE
103   /* Lame fallback implementation.  */
104   *res = *argX * *argY + *argZ;
105#else
106   ENV_TYPE env;
107   /* Multiplication is always exact.  */
108   Double temp = (Double) *argX * (Double) *argY;
109   union vg_ieee754_double u;
110
111   ROUNDTOZERO (env);
112
113   /* Perform addition with round to odd.  */
114   u.d = temp + (Double) *argZ;
115   /* Ensure the addition is not scheduled after fetestexcept call.  */
116   FORCE_EVAL (u.d);
117
118   /* Reset rounding mode and test for inexact simultaneously.  */
119   int j = RESET_TESTINEXACT (env);
120
121   if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
122      u.ieee.mantissa1 |= j;
123
124   /* And finally truncation with round to nearest.  */
125   *res = (Float) u.d;
126#endif
127}
128
129
130void VEX_REGPARM(3)
131     h_generic_calc_MAddF64 ( /*OUT*/Double* res,
132                               Double* argX, Double* argY, Double* argZ )
133{
134#ifndef ENV_TYPE
135   /* Lame fallback implementation.  */
136   *res = *argX * *argY + *argZ;
137#else
138   Double x = *argX, y = *argY, z = *argZ;
139   union vg_ieee754_double u, v, w;
140   int adjust = 0;
141   u.d = x;
142   v.d = y;
143   w.d = z;
144   if (UNLIKELY (u.ieee.exponent + v.ieee.exponent
145                 >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
146       || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
147       || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
148       || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
149       || UNLIKELY (u.ieee.exponent + v.ieee.exponent
150                    <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) {
151      /* If z is Inf, but x and y are finite, the result should be
152         z rather than NaN.  */
153      if (w.ieee.exponent == 0x7ff
154          && u.ieee.exponent != 0x7ff
155          && v.ieee.exponent != 0x7ff) {
156         *res = (z + x) + y;
157         return;
158      }
159      /* If x or y or z is Inf/NaN, or if fma will certainly overflow,
160         or if x * y is less than half of DBL_DENORM_MIN,
161         compute as x * y + z.  */
162      if (u.ieee.exponent == 0x7ff
163          || v.ieee.exponent == 0x7ff
164          || w.ieee.exponent == 0x7ff
165          || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS
166          || u.ieee.exponent + v.ieee.exponent
167             < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) {
168         *res = x * y + z;
169         return;
170      }
171      if (u.ieee.exponent + v.ieee.exponent
172          >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) {
173         /* Compute 1p-53 times smaller result and multiply
174            at the end.  */
175         if (u.ieee.exponent > v.ieee.exponent)
176            u.ieee.exponent -= DBL_MANT_DIG;
177         else
178            v.ieee.exponent -= DBL_MANT_DIG;
179         /* If x + y exponent is very large and z exponent is very small,
180            it doesn't matter if we don't adjust it.  */
181         if (w.ieee.exponent > DBL_MANT_DIG)
182            w.ieee.exponent -= DBL_MANT_DIG;
183         adjust = 1;
184      } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
185         /* Similarly.
186            If z exponent is very large and x and y exponents are
187            very small, it doesn't matter if we don't adjust it.  */
188         if (u.ieee.exponent > v.ieee.exponent) {
189            if (u.ieee.exponent > DBL_MANT_DIG)
190               u.ieee.exponent -= DBL_MANT_DIG;
191         } else if (v.ieee.exponent > DBL_MANT_DIG)
192            v.ieee.exponent -= DBL_MANT_DIG;
193         w.ieee.exponent -= DBL_MANT_DIG;
194         adjust = 1;
195      } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
196         u.ieee.exponent -= DBL_MANT_DIG;
197         if (v.ieee.exponent)
198            v.ieee.exponent += DBL_MANT_DIG;
199         else
200            v.d *= 0x1p53;
201      } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
202         v.ieee.exponent -= DBL_MANT_DIG;
203         if (u.ieee.exponent)
204            u.ieee.exponent += DBL_MANT_DIG;
205         else
206            u.d *= 0x1p53;
207      } else /* if (u.ieee.exponent + v.ieee.exponent
208                    <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ {
209         if (u.ieee.exponent > v.ieee.exponent)
210            u.ieee.exponent += 2 * DBL_MANT_DIG;
211         else
212            v.ieee.exponent += 2 * DBL_MANT_DIG;
213         if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) {
214            if (w.ieee.exponent)
215               w.ieee.exponent += 2 * DBL_MANT_DIG;
216            else
217               w.d *= 0x1p106;
218            adjust = -1;
219         }
220         /* Otherwise x * y should just affect inexact
221            and nothing else.  */
222      }
223      x = u.d;
224      y = v.d;
225      z = w.d;
226   }
227   /* Multiplication m1 + m2 = x * y using Dekker's algorithm.  */
228#  define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
229   Double x1 = x * C;
230   Double y1 = y * C;
231   Double m1 = x * y;
232   x1 = (x - x1) + x1;
233   y1 = (y - y1) + y1;
234   Double x2 = x - x1;
235   Double y2 = y - y1;
236   Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
237#  undef C
238
239   /* Addition a1 + a2 = z + m1 using Knuth's algorithm.  */
240   Double a1 = z + m1;
241   Double t1 = a1 - z;
242   Double t2 = a1 - t1;
243   t1 = m1 - t1;
244   t2 = z - t2;
245   Double a2 = t1 + t2;
246
247   ENV_TYPE env;
248   ROUNDTOZERO (env);
249
250   /* Perform m2 + a2 addition with round to odd.  */
251   u.d = a2 + m2;
252
253   if (UNLIKELY (adjust < 0)) {
254      if ((u.ieee.mantissa1 & 1) == 0)
255         u.ieee.mantissa1 |= TESTINEXACT ();
256      v.d = a1 + u.d;
257      /* Ensure the addition is not scheduled after fetestexcept call.  */
258      FORCE_EVAL (v.d);
259   }
260
261   /* Reset rounding mode and test for inexact simultaneously.  */
262   int j = RESET_TESTINEXACT (env) != 0;
263
264   if (LIKELY (adjust == 0)) {
265      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
266         u.ieee.mantissa1 |= j;
267      /* Result is a1 + u.d.  */
268      *res = a1 + u.d;
269   } else if (LIKELY (adjust > 0)) {
270      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
271         u.ieee.mantissa1 |= j;
272      /* Result is a1 + u.d, scaled up.  */
273      *res = (a1 + u.d) * 0x1p53;
274   } else {
275      /* If a1 + u.d is exact, the only rounding happens during
276         scaling down.  */
277      if (j == 0) {
278         *res = v.d * 0x1p-106;
279         return;
280      }
281      /* If result rounded to zero is not subnormal, no double
282         rounding will occur.  */
283      if (v.ieee.exponent > 106) {
284         *res = (a1 + u.d) * 0x1p-106;
285         return;
286      }
287      /* If v.d * 0x1p-106 with round to zero is a subnormal above
288         or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
289         down just by 1 bit, which means v.ieee.mantissa1 |= j would
290         change the round bit, not sticky or guard bit.
291         v.d * 0x1p-106 never normalizes by shifting up,
292         so round bit plus sticky bit should be already enough
293         for proper rounding.  */
294      if (v.ieee.exponent == 106) {
295         /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
296            v.ieee.mantissa1 & 1 is the round bit and j is our sticky
297            bit.  In round-to-nearest 001 rounds down like 00,
298            011 rounds up, even though 01 rounds down (thus we need
299            to adjust), 101 rounds down like 10 and 111 rounds up
300            like 11.  */
301         if ((v.ieee.mantissa1 & 3) == 1) {
302            v.d *= 0x1p-106;
303            if (v.ieee.negative)
304               *res = v.d - 0x1p-1074;
305            else
306               *res = v.d + 0x1p-1074;
307         } else
308            *res = v.d * 0x1p-106;
309         return;
310      }
311      v.ieee.mantissa1 |= j;
312      *res = v.d * 0x1p-106;
313      return;
314    }
315#endif
316}
317
318/*---------------------------------------------------------------*/
319/*--- end                                 host_generic_maddf.c --*/
320/*---------------------------------------------------------------*/
321