softfloat.c revision c27f813900a3c114562efbb8df1065e94766fc48
1
2/*============================================================================
3
4This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5Package, Release 2b.
6
7Written by John R. Hauser.  This work was made possible in part by the
8International Computer Science Institute, located at Suite 600, 1947 Center
9Street, Berkeley, California 94704.  Funding was partially provided by the
10National Science Foundation under grant MIP-9311980.  The original version
11of this code was written as part of a project to build a fixed-point vector
12processor in collaboration with the University of California at Berkeley,
13overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15arithmetic/SoftFloat.html'.
16
17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26Derivative works are acceptable, even for commercial purposes, so long as
27(1) the source code for the derivative work includes prominent notice that
28the work is derivative, and (2) the source code includes prominent notice with
29these four paragraphs for those parts of this code that are retained.
30
31=============================================================================*/
32
33#include "softfloat.h"
34
35/*----------------------------------------------------------------------------
36| Primitive arithmetic functions, including multi-word arithmetic, and
37| division and square root approximations.  (Can be specialized to target if
38| desired.)
39*----------------------------------------------------------------------------*/
40#include "softfloat-macros.h"
41
42/*----------------------------------------------------------------------------
43| Functions and definitions to determine:  (1) whether tininess for underflow
44| is detected before or after rounding by default, (2) what (if anything)
45| happens when exceptions are raised, (3) how signaling NaNs are distinguished
46| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47| are propagated from function inputs to output.  These details are target-
48| specific.
49*----------------------------------------------------------------------------*/
50#include "softfloat-specialize.h"
51
52void set_float_rounding_mode(int val STATUS_PARAM)
53{
54    STATUS(float_rounding_mode) = val;
55}
56
57void set_float_exception_flags(int val STATUS_PARAM)
58{
59    STATUS(float_exception_flags) = val;
60}
61
62#ifdef FLOATX80
63void set_floatx80_rounding_precision(int val STATUS_PARAM)
64{
65    STATUS(floatx80_rounding_precision) = val;
66}
67#endif
68
69/*----------------------------------------------------------------------------
70| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71| and 7, and returns the properly rounded 32-bit integer corresponding to the
72| input.  If `zSign' is 1, the input is negated before being converted to an
73| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
74| is simply rounded to an integer, with the inexact exception raised if the
75| input cannot be represented exactly as an integer.  However, if the fixed-
76| point input is too large, the invalid exception is raised and the largest
77| positive or negative integer is returned.
78*----------------------------------------------------------------------------*/
79
80static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
81{
82    int8 roundingMode;
83    flag roundNearestEven;
84    int8 roundIncrement, roundBits;
85    int32 z;
86
87    roundingMode = STATUS(float_rounding_mode);
88    roundNearestEven = ( roundingMode == float_round_nearest_even );
89    roundIncrement = 0x40;
90    if ( ! roundNearestEven ) {
91        if ( roundingMode == float_round_to_zero ) {
92            roundIncrement = 0;
93        }
94        else {
95            roundIncrement = 0x7F;
96            if ( zSign ) {
97                if ( roundingMode == float_round_up ) roundIncrement = 0;
98            }
99            else {
100                if ( roundingMode == float_round_down ) roundIncrement = 0;
101            }
102        }
103    }
104    roundBits = absZ & 0x7F;
105    absZ = ( absZ + roundIncrement )>>7;
106    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107    z = absZ;
108    if ( zSign ) z = - z;
109    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110        float_raise( float_flag_invalid STATUS_VAR);
111        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
112    }
113    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
114    return z;
115
116}
117
118/*----------------------------------------------------------------------------
119| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120| `absZ1', with binary point between bits 63 and 64 (between the input words),
121| and returns the properly rounded 64-bit integer corresponding to the input.
122| If `zSign' is 1, the input is negated before being converted to an integer.
123| Ordinarily, the fixed-point input is simply rounded to an integer, with
124| the inexact exception raised if the input cannot be represented exactly as
125| an integer.  However, if the fixed-point input is too large, the invalid
126| exception is raised and the largest positive or negative integer is
127| returned.
128*----------------------------------------------------------------------------*/
129
130static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
131{
132    int8 roundingMode;
133    flag roundNearestEven, increment;
134    int64 z;
135
136    roundingMode = STATUS(float_rounding_mode);
137    roundNearestEven = ( roundingMode == float_round_nearest_even );
138    increment = ( (sbits64) absZ1 < 0 );
139    if ( ! roundNearestEven ) {
140        if ( roundingMode == float_round_to_zero ) {
141            increment = 0;
142        }
143        else {
144            if ( zSign ) {
145                increment = ( roundingMode == float_round_down ) && absZ1;
146            }
147            else {
148                increment = ( roundingMode == float_round_up ) && absZ1;
149            }
150        }
151    }
152    if ( increment ) {
153        ++absZ0;
154        if ( absZ0 == 0 ) goto overflow;
155        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
156    }
157    z = absZ0;
158    if ( zSign ) z = - z;
159    if ( z && ( ( z < 0 ) ^ zSign ) ) {
160 overflow:
161        float_raise( float_flag_invalid STATUS_VAR);
162        return
163              zSign ? (sbits64) LIT64( 0x8000000000000000 )
164            : LIT64( 0x7FFFFFFFFFFFFFFF );
165    }
166    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
167    return z;
168
169}
170
171/*----------------------------------------------------------------------------
172| Returns the fraction bits of the single-precision floating-point value `a'.
173*----------------------------------------------------------------------------*/
174
175INLINE bits32 extractFloat32Frac( float32 a )
176{
177
178    return float32_val(a) & 0x007FFFFF;
179
180}
181
182/*----------------------------------------------------------------------------
183| Returns the exponent bits of the single-precision floating-point value `a'.
184*----------------------------------------------------------------------------*/
185
186INLINE int16 extractFloat32Exp( float32 a )
187{
188
189    return ( float32_val(a)>>23 ) & 0xFF;
190
191}
192
193/*----------------------------------------------------------------------------
194| Returns the sign bit of the single-precision floating-point value `a'.
195*----------------------------------------------------------------------------*/
196
197INLINE flag extractFloat32Sign( float32 a )
198{
199
200    return float32_val(a)>>31;
201
202}
203
204/*----------------------------------------------------------------------------
205| Normalizes the subnormal single-precision floating-point value represented
206| by the denormalized significand `aSig'.  The normalized exponent and
207| significand are stored at the locations pointed to by `zExpPtr' and
208| `zSigPtr', respectively.
209*----------------------------------------------------------------------------*/
210
211static void
212 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
213{
214    int8 shiftCount;
215
216    shiftCount = countLeadingZeros32( aSig ) - 8;
217    *zSigPtr = aSig<<shiftCount;
218    *zExpPtr = 1 - shiftCount;
219
220}
221
222/*----------------------------------------------------------------------------
223| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224| single-precision floating-point value, returning the result.  After being
225| shifted into the proper positions, the three fields are simply added
226| together to form the result.  This means that any integer portion of `zSig'
227| will be added into the exponent.  Since a properly normalized significand
228| will have an integer portion equal to 1, the `zExp' input should be 1 less
229| than the desired result exponent whenever `zSig' is a complete, normalized
230| significand.
231*----------------------------------------------------------------------------*/
232
233INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
234{
235
236    return make_float32(
237          ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
238
239}
240
241/*----------------------------------------------------------------------------
242| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
243| and significand `zSig', and returns the proper single-precision floating-
244| point value corresponding to the abstract input.  Ordinarily, the abstract
245| value is simply rounded and packed into the single-precision format, with
246| the inexact exception raised if the abstract input cannot be represented
247| exactly.  However, if the abstract value is too large, the overflow and
248| inexact exceptions are raised and an infinity or maximal finite value is
249| returned.  If the abstract value is too small, the input value is rounded to
250| a subnormal number, and the underflow and inexact exceptions are raised if
251| the abstract input cannot be represented exactly as a subnormal single-
252| precision floating-point number.
253|     The input significand `zSig' has its binary point between bits 30
254| and 29, which is 7 bits to the left of the usual location.  This shifted
255| significand must be normalized or smaller.  If `zSig' is not normalized,
256| `zExp' must be 0; in that case, the result returned is a subnormal number,
257| and it must not require rounding.  In the usual case that `zSig' is
258| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
259| The handling of underflow and overflow follows the IEC/IEEE Standard for
260| Binary Floating-Point Arithmetic.
261*----------------------------------------------------------------------------*/
262
263static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
264{
265    int8 roundingMode;
266    flag roundNearestEven;
267    int8 roundIncrement, roundBits;
268    flag isTiny;
269
270    roundingMode = STATUS(float_rounding_mode);
271    roundNearestEven = ( roundingMode == float_round_nearest_even );
272    roundIncrement = 0x40;
273    if ( ! roundNearestEven ) {
274        if ( roundingMode == float_round_to_zero ) {
275            roundIncrement = 0;
276        }
277        else {
278            roundIncrement = 0x7F;
279            if ( zSign ) {
280                if ( roundingMode == float_round_up ) roundIncrement = 0;
281            }
282            else {
283                if ( roundingMode == float_round_down ) roundIncrement = 0;
284            }
285        }
286    }
287    roundBits = zSig & 0x7F;
288    if ( 0xFD <= (bits16) zExp ) {
289        if (    ( 0xFD < zExp )
290             || (    ( zExp == 0xFD )
291                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
292           ) {
293            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
294            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
295        }
296        if ( zExp < 0 ) {
297            isTiny =
298                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
299                || ( zExp < -1 )
300                || ( zSig + roundIncrement < 0x80000000 );
301            shift32RightJamming( zSig, - zExp, &zSig );
302            zExp = 0;
303            roundBits = zSig & 0x7F;
304            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
305        }
306    }
307    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
308    zSig = ( zSig + roundIncrement )>>7;
309    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
310    if ( zSig == 0 ) zExp = 0;
311    return packFloat32( zSign, zExp, zSig );
312
313}
314
315/*----------------------------------------------------------------------------
316| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
317| and significand `zSig', and returns the proper single-precision floating-
318| point value corresponding to the abstract input.  This routine is just like
319| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
320| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
321| floating-point exponent.
322*----------------------------------------------------------------------------*/
323
324static float32
325 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
326{
327    int8 shiftCount;
328
329    shiftCount = countLeadingZeros32( zSig ) - 1;
330    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
331
332}
333
334/*----------------------------------------------------------------------------
335| Returns the fraction bits of the double-precision floating-point value `a'.
336*----------------------------------------------------------------------------*/
337
338INLINE bits64 extractFloat64Frac( float64 a )
339{
340
341    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
342
343}
344
345/*----------------------------------------------------------------------------
346| Returns the exponent bits of the double-precision floating-point value `a'.
347*----------------------------------------------------------------------------*/
348
349INLINE int16 extractFloat64Exp( float64 a )
350{
351
352    return ( float64_val(a)>>52 ) & 0x7FF;
353
354}
355
356/*----------------------------------------------------------------------------
357| Returns the sign bit of the double-precision floating-point value `a'.
358*----------------------------------------------------------------------------*/
359
360INLINE flag extractFloat64Sign( float64 a )
361{
362
363    return float64_val(a)>>63;
364
365}
366
367/*----------------------------------------------------------------------------
368| Normalizes the subnormal double-precision floating-point value represented
369| by the denormalized significand `aSig'.  The normalized exponent and
370| significand are stored at the locations pointed to by `zExpPtr' and
371| `zSigPtr', respectively.
372*----------------------------------------------------------------------------*/
373
374static void
375 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
376{
377    int8 shiftCount;
378
379    shiftCount = countLeadingZeros64( aSig ) - 11;
380    *zSigPtr = aSig<<shiftCount;
381    *zExpPtr = 1 - shiftCount;
382
383}
384
385/*----------------------------------------------------------------------------
386| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
387| double-precision floating-point value, returning the result.  After being
388| shifted into the proper positions, the three fields are simply added
389| together to form the result.  This means that any integer portion of `zSig'
390| will be added into the exponent.  Since a properly normalized significand
391| will have an integer portion equal to 1, the `zExp' input should be 1 less
392| than the desired result exponent whenever `zSig' is a complete, normalized
393| significand.
394*----------------------------------------------------------------------------*/
395
396INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
397{
398
399    return make_float64(
400        ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
401
402}
403
404/*----------------------------------------------------------------------------
405| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
406| and significand `zSig', and returns the proper double-precision floating-
407| point value corresponding to the abstract input.  Ordinarily, the abstract
408| value is simply rounded and packed into the double-precision format, with
409| the inexact exception raised if the abstract input cannot be represented
410| exactly.  However, if the abstract value is too large, the overflow and
411| inexact exceptions are raised and an infinity or maximal finite value is
412| returned.  If the abstract value is too small, the input value is rounded
413| to a subnormal number, and the underflow and inexact exceptions are raised
414| if the abstract input cannot be represented exactly as a subnormal double-
415| precision floating-point number.
416|     The input significand `zSig' has its binary point between bits 62
417| and 61, which is 10 bits to the left of the usual location.  This shifted
418| significand must be normalized or smaller.  If `zSig' is not normalized,
419| `zExp' must be 0; in that case, the result returned is a subnormal number,
420| and it must not require rounding.  In the usual case that `zSig' is
421| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
422| The handling of underflow and overflow follows the IEC/IEEE Standard for
423| Binary Floating-Point Arithmetic.
424*----------------------------------------------------------------------------*/
425
426static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
427{
428    int8 roundingMode;
429    flag roundNearestEven;
430    int16 roundIncrement, roundBits;
431    flag isTiny;
432
433    roundingMode = STATUS(float_rounding_mode);
434    roundNearestEven = ( roundingMode == float_round_nearest_even );
435    roundIncrement = 0x200;
436    if ( ! roundNearestEven ) {
437        if ( roundingMode == float_round_to_zero ) {
438            roundIncrement = 0;
439        }
440        else {
441            roundIncrement = 0x3FF;
442            if ( zSign ) {
443                if ( roundingMode == float_round_up ) roundIncrement = 0;
444            }
445            else {
446                if ( roundingMode == float_round_down ) roundIncrement = 0;
447            }
448        }
449    }
450    roundBits = zSig & 0x3FF;
451    if ( 0x7FD <= (bits16) zExp ) {
452        if (    ( 0x7FD < zExp )
453             || (    ( zExp == 0x7FD )
454                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
455           ) {
456            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
457            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
458        }
459        if ( zExp < 0 ) {
460            isTiny =
461                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
462                || ( zExp < -1 )
463                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
464            shift64RightJamming( zSig, - zExp, &zSig );
465            zExp = 0;
466            roundBits = zSig & 0x3FF;
467            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
468        }
469    }
470    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
471    zSig = ( zSig + roundIncrement )>>10;
472    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
473    if ( zSig == 0 ) zExp = 0;
474    return packFloat64( zSign, zExp, zSig );
475
476}
477
478/*----------------------------------------------------------------------------
479| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
480| and significand `zSig', and returns the proper double-precision floating-
481| point value corresponding to the abstract input.  This routine is just like
482| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
483| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
484| floating-point exponent.
485*----------------------------------------------------------------------------*/
486
487static float64
488 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
489{
490    int8 shiftCount;
491
492    shiftCount = countLeadingZeros64( zSig ) - 1;
493    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
494
495}
496
497#ifdef FLOATX80
498
499/*----------------------------------------------------------------------------
500| Returns the fraction bits of the extended double-precision floating-point
501| value `a'.
502*----------------------------------------------------------------------------*/
503
504INLINE bits64 extractFloatx80Frac( floatx80 a )
505{
506
507    return a.low;
508
509}
510
511/*----------------------------------------------------------------------------
512| Returns the exponent bits of the extended double-precision floating-point
513| value `a'.
514*----------------------------------------------------------------------------*/
515
516INLINE int32 extractFloatx80Exp( floatx80 a )
517{
518
519    return a.high & 0x7FFF;
520
521}
522
523/*----------------------------------------------------------------------------
524| Returns the sign bit of the extended double-precision floating-point value
525| `a'.
526*----------------------------------------------------------------------------*/
527
528INLINE flag extractFloatx80Sign( floatx80 a )
529{
530
531    return a.high>>15;
532
533}
534
535/*----------------------------------------------------------------------------
536| Normalizes the subnormal extended double-precision floating-point value
537| represented by the denormalized significand `aSig'.  The normalized exponent
538| and significand are stored at the locations pointed to by `zExpPtr' and
539| `zSigPtr', respectively.
540*----------------------------------------------------------------------------*/
541
542static void
543 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
544{
545    int8 shiftCount;
546
547    shiftCount = countLeadingZeros64( aSig );
548    *zSigPtr = aSig<<shiftCount;
549    *zExpPtr = 1 - shiftCount;
550
551}
552
553/*----------------------------------------------------------------------------
554| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
555| extended double-precision floating-point value, returning the result.
556*----------------------------------------------------------------------------*/
557
558INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
559{
560    floatx80 z;
561
562    z.low = zSig;
563    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
564    return z;
565
566}
567
568/*----------------------------------------------------------------------------
569| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
570| and extended significand formed by the concatenation of `zSig0' and `zSig1',
571| and returns the proper extended double-precision floating-point value
572| corresponding to the abstract input.  Ordinarily, the abstract value is
573| rounded and packed into the extended double-precision format, with the
574| inexact exception raised if the abstract input cannot be represented
575| exactly.  However, if the abstract value is too large, the overflow and
576| inexact exceptions are raised and an infinity or maximal finite value is
577| returned.  If the abstract value is too small, the input value is rounded to
578| a subnormal number, and the underflow and inexact exceptions are raised if
579| the abstract input cannot be represented exactly as a subnormal extended
580| double-precision floating-point number.
581|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
582| number of bits as single or double precision, respectively.  Otherwise, the
583| result is rounded to the full precision of the extended double-precision
584| format.
585|     The input significand must be normalized or smaller.  If the input
586| significand is not normalized, `zExp' must be 0; in that case, the result
587| returned is a subnormal number, and it must not require rounding.  The
588| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
589| Floating-Point Arithmetic.
590*----------------------------------------------------------------------------*/
591
592static floatx80
593 roundAndPackFloatx80(
594     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
595 STATUS_PARAM)
596{
597    int8 roundingMode;
598    flag roundNearestEven, increment, isTiny;
599    int64 roundIncrement, roundMask, roundBits;
600
601    roundingMode = STATUS(float_rounding_mode);
602    roundNearestEven = ( roundingMode == float_round_nearest_even );
603    if ( roundingPrecision == 80 ) goto precision80;
604    if ( roundingPrecision == 64 ) {
605        roundIncrement = LIT64( 0x0000000000000400 );
606        roundMask = LIT64( 0x00000000000007FF );
607    }
608    else if ( roundingPrecision == 32 ) {
609        roundIncrement = LIT64( 0x0000008000000000 );
610        roundMask = LIT64( 0x000000FFFFFFFFFF );
611    }
612    else {
613        goto precision80;
614    }
615    zSig0 |= ( zSig1 != 0 );
616    if ( ! roundNearestEven ) {
617        if ( roundingMode == float_round_to_zero ) {
618            roundIncrement = 0;
619        }
620        else {
621            roundIncrement = roundMask;
622            if ( zSign ) {
623                if ( roundingMode == float_round_up ) roundIncrement = 0;
624            }
625            else {
626                if ( roundingMode == float_round_down ) roundIncrement = 0;
627            }
628        }
629    }
630    roundBits = zSig0 & roundMask;
631    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
632        if (    ( 0x7FFE < zExp )
633             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
634           ) {
635            goto overflow;
636        }
637        if ( zExp <= 0 ) {
638            isTiny =
639                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
640                || ( zExp < 0 )
641                || ( zSig0 <= zSig0 + roundIncrement );
642            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
643            zExp = 0;
644            roundBits = zSig0 & roundMask;
645            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
646            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
647            zSig0 += roundIncrement;
648            if ( (sbits64) zSig0 < 0 ) zExp = 1;
649            roundIncrement = roundMask + 1;
650            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
651                roundMask |= roundIncrement;
652            }
653            zSig0 &= ~ roundMask;
654            return packFloatx80( zSign, zExp, zSig0 );
655        }
656    }
657    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
658    zSig0 += roundIncrement;
659    if ( zSig0 < roundIncrement ) {
660        ++zExp;
661        zSig0 = LIT64( 0x8000000000000000 );
662    }
663    roundIncrement = roundMask + 1;
664    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
665        roundMask |= roundIncrement;
666    }
667    zSig0 &= ~ roundMask;
668    if ( zSig0 == 0 ) zExp = 0;
669    return packFloatx80( zSign, zExp, zSig0 );
670 precision80:
671    increment = ( (sbits64) zSig1 < 0 );
672    if ( ! roundNearestEven ) {
673        if ( roundingMode == float_round_to_zero ) {
674            increment = 0;
675        }
676        else {
677            if ( zSign ) {
678                increment = ( roundingMode == float_round_down ) && zSig1;
679            }
680            else {
681                increment = ( roundingMode == float_round_up ) && zSig1;
682            }
683        }
684    }
685    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
686        if (    ( 0x7FFE < zExp )
687             || (    ( zExp == 0x7FFE )
688                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
689                  && increment
690                )
691           ) {
692            roundMask = 0;
693 overflow:
694            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
695            if (    ( roundingMode == float_round_to_zero )
696                 || ( zSign && ( roundingMode == float_round_up ) )
697                 || ( ! zSign && ( roundingMode == float_round_down ) )
698               ) {
699                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
700            }
701            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
702        }
703        if ( zExp <= 0 ) {
704            isTiny =
705                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
706                || ( zExp < 0 )
707                || ! increment
708                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
709            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
710            zExp = 0;
711            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
712            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
713            if ( roundNearestEven ) {
714                increment = ( (sbits64) zSig1 < 0 );
715            }
716            else {
717                if ( zSign ) {
718                    increment = ( roundingMode == float_round_down ) && zSig1;
719                }
720                else {
721                    increment = ( roundingMode == float_round_up ) && zSig1;
722                }
723            }
724            if ( increment ) {
725                ++zSig0;
726                zSig0 &=
727                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
728                if ( (sbits64) zSig0 < 0 ) zExp = 1;
729            }
730            return packFloatx80( zSign, zExp, zSig0 );
731        }
732    }
733    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
734    if ( increment ) {
735        ++zSig0;
736        if ( zSig0 == 0 ) {
737            ++zExp;
738            zSig0 = LIT64( 0x8000000000000000 );
739        }
740        else {
741            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
742        }
743    }
744    else {
745        if ( zSig0 == 0 ) zExp = 0;
746    }
747    return packFloatx80( zSign, zExp, zSig0 );
748
749}
750
751/*----------------------------------------------------------------------------
752| Takes an abstract floating-point value having sign `zSign', exponent
753| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
754| and returns the proper extended double-precision floating-point value
755| corresponding to the abstract input.  This routine is just like
756| `roundAndPackFloatx80' except that the input significand does not have to be
757| normalized.
758*----------------------------------------------------------------------------*/
759
760static floatx80
761 normalizeRoundAndPackFloatx80(
762     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
763 STATUS_PARAM)
764{
765    int8 shiftCount;
766
767    if ( zSig0 == 0 ) {
768        zSig0 = zSig1;
769        zSig1 = 0;
770        zExp -= 64;
771    }
772    shiftCount = countLeadingZeros64( zSig0 );
773    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
774    zExp -= shiftCount;
775    return
776        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
777
778}
779
780#endif
781
782#ifdef FLOAT128
783
784/*----------------------------------------------------------------------------
785| Returns the least-significant 64 fraction bits of the quadruple-precision
786| floating-point value `a'.
787*----------------------------------------------------------------------------*/
788
789INLINE bits64 extractFloat128Frac1( float128 a )
790{
791
792    return a.low;
793
794}
795
796/*----------------------------------------------------------------------------
797| Returns the most-significant 48 fraction bits of the quadruple-precision
798| floating-point value `a'.
799*----------------------------------------------------------------------------*/
800
801INLINE bits64 extractFloat128Frac0( float128 a )
802{
803
804    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
805
806}
807
808/*----------------------------------------------------------------------------
809| Returns the exponent bits of the quadruple-precision floating-point value
810| `a'.
811*----------------------------------------------------------------------------*/
812
813INLINE int32 extractFloat128Exp( float128 a )
814{
815
816    return ( a.high>>48 ) & 0x7FFF;
817
818}
819
820/*----------------------------------------------------------------------------
821| Returns the sign bit of the quadruple-precision floating-point value `a'.
822*----------------------------------------------------------------------------*/
823
824INLINE flag extractFloat128Sign( float128 a )
825{
826
827    return a.high>>63;
828
829}
830
831/*----------------------------------------------------------------------------
832| Normalizes the subnormal quadruple-precision floating-point value
833| represented by the denormalized significand formed by the concatenation of
834| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
835| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
836| significand are stored at the location pointed to by `zSig0Ptr', and the
837| least significant 64 bits of the normalized significand are stored at the
838| location pointed to by `zSig1Ptr'.
839*----------------------------------------------------------------------------*/
840
841static void
842 normalizeFloat128Subnormal(
843     bits64 aSig0,
844     bits64 aSig1,
845     int32 *zExpPtr,
846     bits64 *zSig0Ptr,
847     bits64 *zSig1Ptr
848 )
849{
850    int8 shiftCount;
851
852    if ( aSig0 == 0 ) {
853        shiftCount = countLeadingZeros64( aSig1 ) - 15;
854        if ( shiftCount < 0 ) {
855            *zSig0Ptr = aSig1>>( - shiftCount );
856            *zSig1Ptr = aSig1<<( shiftCount & 63 );
857        }
858        else {
859            *zSig0Ptr = aSig1<<shiftCount;
860            *zSig1Ptr = 0;
861        }
862        *zExpPtr = - shiftCount - 63;
863    }
864    else {
865        shiftCount = countLeadingZeros64( aSig0 ) - 15;
866        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
867        *zExpPtr = 1 - shiftCount;
868    }
869
870}
871
872/*----------------------------------------------------------------------------
873| Packs the sign `zSign', the exponent `zExp', and the significand formed
874| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
875| floating-point value, returning the result.  After being shifted into the
876| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
877| added together to form the most significant 32 bits of the result.  This
878| means that any integer portion of `zSig0' will be added into the exponent.
879| Since a properly normalized significand will have an integer portion equal
880| to 1, the `zExp' input should be 1 less than the desired result exponent
881| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
882| significand.
883*----------------------------------------------------------------------------*/
884
885INLINE float128
886 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
887{
888    float128 z;
889
890    z.low = zSig1;
891    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
892    return z;
893
894}
895
896/*----------------------------------------------------------------------------
897| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
898| and extended significand formed by the concatenation of `zSig0', `zSig1',
899| and `zSig2', and returns the proper quadruple-precision floating-point value
900| corresponding to the abstract input.  Ordinarily, the abstract value is
901| simply rounded and packed into the quadruple-precision format, with the
902| inexact exception raised if the abstract input cannot be represented
903| exactly.  However, if the abstract value is too large, the overflow and
904| inexact exceptions are raised and an infinity or maximal finite value is
905| returned.  If the abstract value is too small, the input value is rounded to
906| a subnormal number, and the underflow and inexact exceptions are raised if
907| the abstract input cannot be represented exactly as a subnormal quadruple-
908| precision floating-point number.
909|     The input significand must be normalized or smaller.  If the input
910| significand is not normalized, `zExp' must be 0; in that case, the result
911| returned is a subnormal number, and it must not require rounding.  In the
912| usual case that the input significand is normalized, `zExp' must be 1 less
913| than the ``true'' floating-point exponent.  The handling of underflow and
914| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
915*----------------------------------------------------------------------------*/
916
917static float128
918 roundAndPackFloat128(
919     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
920{
921    int8 roundingMode;
922    flag roundNearestEven, increment, isTiny;
923
924    roundingMode = STATUS(float_rounding_mode);
925    roundNearestEven = ( roundingMode == float_round_nearest_even );
926    increment = ( (sbits64) zSig2 < 0 );
927    if ( ! roundNearestEven ) {
928        if ( roundingMode == float_round_to_zero ) {
929            increment = 0;
930        }
931        else {
932            if ( zSign ) {
933                increment = ( roundingMode == float_round_down ) && zSig2;
934            }
935            else {
936                increment = ( roundingMode == float_round_up ) && zSig2;
937            }
938        }
939    }
940    if ( 0x7FFD <= (bits32) zExp ) {
941        if (    ( 0x7FFD < zExp )
942             || (    ( zExp == 0x7FFD )
943                  && eq128(
944                         LIT64( 0x0001FFFFFFFFFFFF ),
945                         LIT64( 0xFFFFFFFFFFFFFFFF ),
946                         zSig0,
947                         zSig1
948                     )
949                  && increment
950                )
951           ) {
952            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
953            if (    ( roundingMode == float_round_to_zero )
954                 || ( zSign && ( roundingMode == float_round_up ) )
955                 || ( ! zSign && ( roundingMode == float_round_down ) )
956               ) {
957                return
958                    packFloat128(
959                        zSign,
960                        0x7FFE,
961                        LIT64( 0x0000FFFFFFFFFFFF ),
962                        LIT64( 0xFFFFFFFFFFFFFFFF )
963                    );
964            }
965            return packFloat128( zSign, 0x7FFF, 0, 0 );
966        }
967        if ( zExp < 0 ) {
968            isTiny =
969                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
970                || ( zExp < -1 )
971                || ! increment
972                || lt128(
973                       zSig0,
974                       zSig1,
975                       LIT64( 0x0001FFFFFFFFFFFF ),
976                       LIT64( 0xFFFFFFFFFFFFFFFF )
977                   );
978            shift128ExtraRightJamming(
979                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
980            zExp = 0;
981            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
982            if ( roundNearestEven ) {
983                increment = ( (sbits64) zSig2 < 0 );
984            }
985            else {
986                if ( zSign ) {
987                    increment = ( roundingMode == float_round_down ) && zSig2;
988                }
989                else {
990                    increment = ( roundingMode == float_round_up ) && zSig2;
991                }
992            }
993        }
994    }
995    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
996    if ( increment ) {
997        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
998        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
999    }
1000    else {
1001        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1002    }
1003    return packFloat128( zSign, zExp, zSig0, zSig1 );
1004
1005}
1006
1007/*----------------------------------------------------------------------------
1008| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1009| and significand formed by the concatenation of `zSig0' and `zSig1', and
1010| returns the proper quadruple-precision floating-point value corresponding
1011| to the abstract input.  This routine is just like `roundAndPackFloat128'
1012| except that the input significand has fewer bits and does not have to be
1013| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1014| point exponent.
1015*----------------------------------------------------------------------------*/
1016
1017static float128
1018 normalizeRoundAndPackFloat128(
1019     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1020{
1021    int8 shiftCount;
1022    bits64 zSig2;
1023
1024    if ( zSig0 == 0 ) {
1025        zSig0 = zSig1;
1026        zSig1 = 0;
1027        zExp -= 64;
1028    }
1029    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1030    if ( 0 <= shiftCount ) {
1031        zSig2 = 0;
1032        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1033    }
1034    else {
1035        shift128ExtraRightJamming(
1036            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1037    }
1038    zExp -= shiftCount;
1039    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1040
1041}
1042
1043#endif
1044
1045/*----------------------------------------------------------------------------
1046| Returns the result of converting the 32-bit two's complement integer `a'
1047| to the single-precision floating-point format.  The conversion is performed
1048| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1049*----------------------------------------------------------------------------*/
1050
1051float32 int32_to_float32( int32 a STATUS_PARAM )
1052{
1053    flag zSign;
1054
1055    if ( a == 0 ) return float32_zero;
1056    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1057    zSign = ( a < 0 );
1058    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1059
1060}
1061
1062/*----------------------------------------------------------------------------
1063| Returns the result of converting the 32-bit two's complement integer `a'
1064| to the double-precision floating-point format.  The conversion is performed
1065| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1066*----------------------------------------------------------------------------*/
1067
1068float64 int32_to_float64( int32 a STATUS_PARAM )
1069{
1070    flag zSign;
1071    uint32 absA;
1072    int8 shiftCount;
1073    bits64 zSig;
1074
1075    if ( a == 0 ) return float64_zero;
1076    zSign = ( a < 0 );
1077    absA = zSign ? - a : a;
1078    shiftCount = countLeadingZeros32( absA ) + 21;
1079    zSig = absA;
1080    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1081
1082}
1083
1084#ifdef FLOATX80
1085
1086/*----------------------------------------------------------------------------
1087| Returns the result of converting the 32-bit two's complement integer `a'
1088| to the extended double-precision floating-point format.  The conversion
1089| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1090| Arithmetic.
1091*----------------------------------------------------------------------------*/
1092
1093floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1094{
1095    flag zSign;
1096    uint32 absA;
1097    int8 shiftCount;
1098    bits64 zSig;
1099
1100    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1101    zSign = ( a < 0 );
1102    absA = zSign ? - a : a;
1103    shiftCount = countLeadingZeros32( absA ) + 32;
1104    zSig = absA;
1105    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1106
1107}
1108
1109#endif
1110
1111#ifdef FLOAT128
1112
1113/*----------------------------------------------------------------------------
1114| Returns the result of converting the 32-bit two's complement integer `a' to
1115| the quadruple-precision floating-point format.  The conversion is performed
1116| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1117*----------------------------------------------------------------------------*/
1118
1119float128 int32_to_float128( int32 a STATUS_PARAM )
1120{
1121    flag zSign;
1122    uint32 absA;
1123    int8 shiftCount;
1124    bits64 zSig0;
1125
1126    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1127    zSign = ( a < 0 );
1128    absA = zSign ? - a : a;
1129    shiftCount = countLeadingZeros32( absA ) + 17;
1130    zSig0 = absA;
1131    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1132
1133}
1134
1135#endif
1136
1137/*----------------------------------------------------------------------------
1138| Returns the result of converting the 64-bit two's complement integer `a'
1139| to the single-precision floating-point format.  The conversion is performed
1140| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1141*----------------------------------------------------------------------------*/
1142
1143float32 int64_to_float32( int64 a STATUS_PARAM )
1144{
1145    flag zSign;
1146    uint64 absA;
1147    int8 shiftCount;
1148
1149    if ( a == 0 ) return float32_zero;
1150    zSign = ( a < 0 );
1151    absA = zSign ? - a : a;
1152    shiftCount = countLeadingZeros64( absA ) - 40;
1153    if ( 0 <= shiftCount ) {
1154        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1155    }
1156    else {
1157        shiftCount += 7;
1158        if ( shiftCount < 0 ) {
1159            shift64RightJamming( absA, - shiftCount, &absA );
1160        }
1161        else {
1162            absA <<= shiftCount;
1163        }
1164        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1165    }
1166
1167}
1168
1169float32 uint64_to_float32( uint64 a STATUS_PARAM )
1170{
1171    int8 shiftCount;
1172
1173    if ( a == 0 ) return float32_zero;
1174    shiftCount = countLeadingZeros64( a ) - 40;
1175    if ( 0 <= shiftCount ) {
1176        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1177    }
1178    else {
1179        shiftCount += 7;
1180        if ( shiftCount < 0 ) {
1181            shift64RightJamming( a, - shiftCount, &a );
1182        }
1183        else {
1184            a <<= shiftCount;
1185        }
1186        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1187    }
1188}
1189
1190/*----------------------------------------------------------------------------
1191| Returns the result of converting the 64-bit two's complement integer `a'
1192| to the double-precision floating-point format.  The conversion is performed
1193| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1194*----------------------------------------------------------------------------*/
1195
1196float64 int64_to_float64( int64 a STATUS_PARAM )
1197{
1198    flag zSign;
1199
1200    if ( a == 0 ) return float64_zero;
1201    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1202        return packFloat64( 1, 0x43E, 0 );
1203    }
1204    zSign = ( a < 0 );
1205    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1206
1207}
1208
1209float64 uint64_to_float64( uint64 a STATUS_PARAM )
1210{
1211    if ( a == 0 ) return float64_zero;
1212    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1213
1214}
1215
1216#ifdef FLOATX80
1217
1218/*----------------------------------------------------------------------------
1219| Returns the result of converting the 64-bit two's complement integer `a'
1220| to the extended double-precision floating-point format.  The conversion
1221| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1222| Arithmetic.
1223*----------------------------------------------------------------------------*/
1224
1225floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1226{
1227    flag zSign;
1228    uint64 absA;
1229    int8 shiftCount;
1230
1231    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1232    zSign = ( a < 0 );
1233    absA = zSign ? - a : a;
1234    shiftCount = countLeadingZeros64( absA );
1235    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1236
1237}
1238
1239#endif
1240
1241#ifdef FLOAT128
1242
1243/*----------------------------------------------------------------------------
1244| Returns the result of converting the 64-bit two's complement integer `a' to
1245| the quadruple-precision floating-point format.  The conversion is performed
1246| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1247*----------------------------------------------------------------------------*/
1248
1249float128 int64_to_float128( int64 a STATUS_PARAM )
1250{
1251    flag zSign;
1252    uint64 absA;
1253    int8 shiftCount;
1254    int32 zExp;
1255    bits64 zSig0, zSig1;
1256
1257    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1258    zSign = ( a < 0 );
1259    absA = zSign ? - a : a;
1260    shiftCount = countLeadingZeros64( absA ) + 49;
1261    zExp = 0x406E - shiftCount;
1262    if ( 64 <= shiftCount ) {
1263        zSig1 = 0;
1264        zSig0 = absA;
1265        shiftCount -= 64;
1266    }
1267    else {
1268        zSig1 = absA;
1269        zSig0 = 0;
1270    }
1271    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1272    return packFloat128( zSign, zExp, zSig0, zSig1 );
1273
1274}
1275
1276#endif
1277
1278/*----------------------------------------------------------------------------
1279| Returns the result of converting the single-precision floating-point value
1280| `a' to the 32-bit two's complement integer format.  The conversion is
1281| performed according to the IEC/IEEE Standard for Binary Floating-Point
1282| Arithmetic---which means in particular that the conversion is rounded
1283| according to the current rounding mode.  If `a' is a NaN, the largest
1284| positive integer is returned.  Otherwise, if the conversion overflows, the
1285| largest integer with the same sign as `a' is returned.
1286*----------------------------------------------------------------------------*/
1287
1288int32 float32_to_int32( float32 a STATUS_PARAM )
1289{
1290    flag aSign;
1291    int16 aExp, shiftCount;
1292    bits32 aSig;
1293    bits64 aSig64;
1294
1295    aSig = extractFloat32Frac( a );
1296    aExp = extractFloat32Exp( a );
1297    aSign = extractFloat32Sign( a );
1298    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1299    if ( aExp ) aSig |= 0x00800000;
1300    shiftCount = 0xAF - aExp;
1301    aSig64 = aSig;
1302    aSig64 <<= 32;
1303    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1304    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1305
1306}
1307
1308/*----------------------------------------------------------------------------
1309| Returns the result of converting the single-precision floating-point value
1310| `a' to the 32-bit two's complement integer format.  The conversion is
1311| performed according to the IEC/IEEE Standard for Binary Floating-Point
1312| Arithmetic, except that the conversion is always rounded toward zero.
1313| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1314| the conversion overflows, the largest integer with the same sign as `a' is
1315| returned.
1316*----------------------------------------------------------------------------*/
1317
1318int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1319{
1320    flag aSign;
1321    int16 aExp, shiftCount;
1322    bits32 aSig;
1323    int32 z;
1324
1325    aSig = extractFloat32Frac( a );
1326    aExp = extractFloat32Exp( a );
1327    aSign = extractFloat32Sign( a );
1328    shiftCount = aExp - 0x9E;
1329    if ( 0 <= shiftCount ) {
1330        if ( float32_val(a) != 0xCF000000 ) {
1331            float_raise( float_flag_invalid STATUS_VAR);
1332            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1333        }
1334        return (sbits32) 0x80000000;
1335    }
1336    else if ( aExp <= 0x7E ) {
1337        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1338        return 0;
1339    }
1340    aSig = ( aSig | 0x00800000 )<<8;
1341    z = aSig>>( - shiftCount );
1342    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1343        STATUS(float_exception_flags) |= float_flag_inexact;
1344    }
1345    if ( aSign ) z = - z;
1346    return z;
1347
1348}
1349
1350/*----------------------------------------------------------------------------
1351| Returns the result of converting the single-precision floating-point value
1352| `a' to the 64-bit two's complement integer format.  The conversion is
1353| performed according to the IEC/IEEE Standard for Binary Floating-Point
1354| Arithmetic---which means in particular that the conversion is rounded
1355| according to the current rounding mode.  If `a' is a NaN, the largest
1356| positive integer is returned.  Otherwise, if the conversion overflows, the
1357| largest integer with the same sign as `a' is returned.
1358*----------------------------------------------------------------------------*/
1359
1360int64 float32_to_int64( float32 a STATUS_PARAM )
1361{
1362    flag aSign;
1363    int16 aExp, shiftCount;
1364    bits32 aSig;
1365    bits64 aSig64, aSigExtra;
1366
1367    aSig = extractFloat32Frac( a );
1368    aExp = extractFloat32Exp( a );
1369    aSign = extractFloat32Sign( a );
1370    shiftCount = 0xBE - aExp;
1371    if ( shiftCount < 0 ) {
1372        float_raise( float_flag_invalid STATUS_VAR);
1373        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1374            return LIT64( 0x7FFFFFFFFFFFFFFF );
1375        }
1376        return (sbits64) LIT64( 0x8000000000000000 );
1377    }
1378    if ( aExp ) aSig |= 0x00800000;
1379    aSig64 = aSig;
1380    aSig64 <<= 40;
1381    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1382    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1383
1384}
1385
1386/*----------------------------------------------------------------------------
1387| Returns the result of converting the single-precision floating-point value
1388| `a' to the 64-bit two's complement integer format.  The conversion is
1389| performed according to the IEC/IEEE Standard for Binary Floating-Point
1390| Arithmetic, except that the conversion is always rounded toward zero.  If
1391| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1392| conversion overflows, the largest integer with the same sign as `a' is
1393| returned.
1394*----------------------------------------------------------------------------*/
1395
1396int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1397{
1398    flag aSign;
1399    int16 aExp, shiftCount;
1400    bits32 aSig;
1401    bits64 aSig64;
1402    int64 z;
1403
1404    aSig = extractFloat32Frac( a );
1405    aExp = extractFloat32Exp( a );
1406    aSign = extractFloat32Sign( a );
1407    shiftCount = aExp - 0xBE;
1408    if ( 0 <= shiftCount ) {
1409        if ( float32_val(a) != 0xDF000000 ) {
1410            float_raise( float_flag_invalid STATUS_VAR);
1411            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1412                return LIT64( 0x7FFFFFFFFFFFFFFF );
1413            }
1414        }
1415        return (sbits64) LIT64( 0x8000000000000000 );
1416    }
1417    else if ( aExp <= 0x7E ) {
1418        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1419        return 0;
1420    }
1421    aSig64 = aSig | 0x00800000;
1422    aSig64 <<= 40;
1423    z = aSig64>>( - shiftCount );
1424    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1425        STATUS(float_exception_flags) |= float_flag_inexact;
1426    }
1427    if ( aSign ) z = - z;
1428    return z;
1429
1430}
1431
1432/*----------------------------------------------------------------------------
1433| Returns the result of converting the single-precision floating-point value
1434| `a' to the double-precision floating-point format.  The conversion is
1435| performed according to the IEC/IEEE Standard for Binary Floating-Point
1436| Arithmetic.
1437*----------------------------------------------------------------------------*/
1438
1439float64 float32_to_float64( float32 a STATUS_PARAM )
1440{
1441    flag aSign;
1442    int16 aExp;
1443    bits32 aSig;
1444
1445    aSig = extractFloat32Frac( a );
1446    aExp = extractFloat32Exp( a );
1447    aSign = extractFloat32Sign( a );
1448    if ( aExp == 0xFF ) {
1449        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1450        return packFloat64( aSign, 0x7FF, 0 );
1451    }
1452    if ( aExp == 0 ) {
1453        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1454        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1455        --aExp;
1456    }
1457    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1458
1459}
1460
1461#ifdef FLOATX80
1462
1463/*----------------------------------------------------------------------------
1464| Returns the result of converting the single-precision floating-point value
1465| `a' to the extended double-precision floating-point format.  The conversion
1466| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1467| Arithmetic.
1468*----------------------------------------------------------------------------*/
1469
1470floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1471{
1472    flag aSign;
1473    int16 aExp;
1474    bits32 aSig;
1475
1476    aSig = extractFloat32Frac( a );
1477    aExp = extractFloat32Exp( a );
1478    aSign = extractFloat32Sign( a );
1479    if ( aExp == 0xFF ) {
1480        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1481        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1482    }
1483    if ( aExp == 0 ) {
1484        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1485        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1486    }
1487    aSig |= 0x00800000;
1488    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1489
1490}
1491
1492#endif
1493
1494#ifdef FLOAT128
1495
1496/*----------------------------------------------------------------------------
1497| Returns the result of converting the single-precision floating-point value
1498| `a' to the double-precision floating-point format.  The conversion is
1499| performed according to the IEC/IEEE Standard for Binary Floating-Point
1500| Arithmetic.
1501*----------------------------------------------------------------------------*/
1502
1503float128 float32_to_float128( float32 a STATUS_PARAM )
1504{
1505    flag aSign;
1506    int16 aExp;
1507    bits32 aSig;
1508
1509    aSig = extractFloat32Frac( a );
1510    aExp = extractFloat32Exp( a );
1511    aSign = extractFloat32Sign( a );
1512    if ( aExp == 0xFF ) {
1513        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1514        return packFloat128( aSign, 0x7FFF, 0, 0 );
1515    }
1516    if ( aExp == 0 ) {
1517        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1518        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1519        --aExp;
1520    }
1521    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1522
1523}
1524
1525#endif
1526
1527/*----------------------------------------------------------------------------
1528| Rounds the single-precision floating-point value `a' to an integer, and
1529| returns the result as a single-precision floating-point value.  The
1530| operation is performed according to the IEC/IEEE Standard for Binary
1531| Floating-Point Arithmetic.
1532*----------------------------------------------------------------------------*/
1533
1534float32 float32_round_to_int( float32 a STATUS_PARAM)
1535{
1536    flag aSign;
1537    int16 aExp;
1538    bits32 lastBitMask, roundBitsMask;
1539    int8 roundingMode;
1540    bits32 z;
1541
1542    aExp = extractFloat32Exp( a );
1543    if ( 0x96 <= aExp ) {
1544        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1545            return propagateFloat32NaN( a, a STATUS_VAR );
1546        }
1547        return a;
1548    }
1549    if ( aExp <= 0x7E ) {
1550        if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1551        STATUS(float_exception_flags) |= float_flag_inexact;
1552        aSign = extractFloat32Sign( a );
1553        switch ( STATUS(float_rounding_mode) ) {
1554         case float_round_nearest_even:
1555            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1556                return packFloat32( aSign, 0x7F, 0 );
1557            }
1558            break;
1559         case float_round_down:
1560            return make_float32(aSign ? 0xBF800000 : 0);
1561         case float_round_up:
1562            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1563        }
1564        return packFloat32( aSign, 0, 0 );
1565    }
1566    lastBitMask = 1;
1567    lastBitMask <<= 0x96 - aExp;
1568    roundBitsMask = lastBitMask - 1;
1569    z = float32_val(a);
1570    roundingMode = STATUS(float_rounding_mode);
1571    if ( roundingMode == float_round_nearest_even ) {
1572        z += lastBitMask>>1;
1573        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1574    }
1575    else if ( roundingMode != float_round_to_zero ) {
1576        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1577            z += roundBitsMask;
1578        }
1579    }
1580    z &= ~ roundBitsMask;
1581    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1582    return make_float32(z);
1583
1584}
1585
1586/*----------------------------------------------------------------------------
1587| Returns the result of adding the absolute values of the single-precision
1588| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1589| before being returned.  `zSign' is ignored if the result is a NaN.
1590| The addition is performed according to the IEC/IEEE Standard for Binary
1591| Floating-Point Arithmetic.
1592*----------------------------------------------------------------------------*/
1593
1594static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1595{
1596    int16 aExp, bExp, zExp;
1597    bits32 aSig, bSig, zSig;
1598    int16 expDiff;
1599
1600    aSig = extractFloat32Frac( a );
1601    aExp = extractFloat32Exp( a );
1602    bSig = extractFloat32Frac( b );
1603    bExp = extractFloat32Exp( b );
1604    expDiff = aExp - bExp;
1605    aSig <<= 6;
1606    bSig <<= 6;
1607    if ( 0 < expDiff ) {
1608        if ( aExp == 0xFF ) {
1609            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1610            return a;
1611        }
1612        if ( bExp == 0 ) {
1613            --expDiff;
1614        }
1615        else {
1616            bSig |= 0x20000000;
1617        }
1618        shift32RightJamming( bSig, expDiff, &bSig );
1619        zExp = aExp;
1620    }
1621    else if ( expDiff < 0 ) {
1622        if ( bExp == 0xFF ) {
1623            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1624            return packFloat32( zSign, 0xFF, 0 );
1625        }
1626        if ( aExp == 0 ) {
1627            ++expDiff;
1628        }
1629        else {
1630            aSig |= 0x20000000;
1631        }
1632        shift32RightJamming( aSig, - expDiff, &aSig );
1633        zExp = bExp;
1634    }
1635    else {
1636        if ( aExp == 0xFF ) {
1637            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1638            return a;
1639        }
1640        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1641        zSig = 0x40000000 + aSig + bSig;
1642        zExp = aExp;
1643        goto roundAndPack;
1644    }
1645    aSig |= 0x20000000;
1646    zSig = ( aSig + bSig )<<1;
1647    --zExp;
1648    if ( (sbits32) zSig < 0 ) {
1649        zSig = aSig + bSig;
1650        ++zExp;
1651    }
1652 roundAndPack:
1653    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1654
1655}
1656
1657/*----------------------------------------------------------------------------
1658| Returns the result of subtracting the absolute values of the single-
1659| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1660| difference is negated before being returned.  `zSign' is ignored if the
1661| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1662| Standard for Binary Floating-Point Arithmetic.
1663*----------------------------------------------------------------------------*/
1664
1665static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1666{
1667    int16 aExp, bExp, zExp;
1668    bits32 aSig, bSig, zSig;
1669    int16 expDiff;
1670
1671    aSig = extractFloat32Frac( a );
1672    aExp = extractFloat32Exp( a );
1673    bSig = extractFloat32Frac( b );
1674    bExp = extractFloat32Exp( b );
1675    expDiff = aExp - bExp;
1676    aSig <<= 7;
1677    bSig <<= 7;
1678    if ( 0 < expDiff ) goto aExpBigger;
1679    if ( expDiff < 0 ) goto bExpBigger;
1680    if ( aExp == 0xFF ) {
1681        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1682        float_raise( float_flag_invalid STATUS_VAR);
1683        return float32_default_nan;
1684    }
1685    if ( aExp == 0 ) {
1686        aExp = 1;
1687        bExp = 1;
1688    }
1689    if ( bSig < aSig ) goto aBigger;
1690    if ( aSig < bSig ) goto bBigger;
1691    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1692 bExpBigger:
1693    if ( bExp == 0xFF ) {
1694        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1695        return packFloat32( zSign ^ 1, 0xFF, 0 );
1696    }
1697    if ( aExp == 0 ) {
1698        ++expDiff;
1699    }
1700    else {
1701        aSig |= 0x40000000;
1702    }
1703    shift32RightJamming( aSig, - expDiff, &aSig );
1704    bSig |= 0x40000000;
1705 bBigger:
1706    zSig = bSig - aSig;
1707    zExp = bExp;
1708    zSign ^= 1;
1709    goto normalizeRoundAndPack;
1710 aExpBigger:
1711    if ( aExp == 0xFF ) {
1712        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1713        return a;
1714    }
1715    if ( bExp == 0 ) {
1716        --expDiff;
1717    }
1718    else {
1719        bSig |= 0x40000000;
1720    }
1721    shift32RightJamming( bSig, expDiff, &bSig );
1722    aSig |= 0x40000000;
1723 aBigger:
1724    zSig = aSig - bSig;
1725    zExp = aExp;
1726 normalizeRoundAndPack:
1727    --zExp;
1728    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1729
1730}
1731
1732/*----------------------------------------------------------------------------
1733| Returns the result of adding the single-precision floating-point values `a'
1734| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1735| Binary Floating-Point Arithmetic.
1736*----------------------------------------------------------------------------*/
1737
1738float32 float32_add( float32 a, float32 b STATUS_PARAM )
1739{
1740    flag aSign, bSign;
1741
1742    aSign = extractFloat32Sign( a );
1743    bSign = extractFloat32Sign( b );
1744    if ( aSign == bSign ) {
1745        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1746    }
1747    else {
1748        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1749    }
1750
1751}
1752
1753/*----------------------------------------------------------------------------
1754| Returns the result of subtracting the single-precision floating-point values
1755| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1756| for Binary Floating-Point Arithmetic.
1757*----------------------------------------------------------------------------*/
1758
1759float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1760{
1761    flag aSign, bSign;
1762
1763    aSign = extractFloat32Sign( a );
1764    bSign = extractFloat32Sign( b );
1765    if ( aSign == bSign ) {
1766        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1767    }
1768    else {
1769        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1770    }
1771
1772}
1773
1774/*----------------------------------------------------------------------------
1775| Returns the result of multiplying the single-precision floating-point values
1776| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1777| for Binary Floating-Point Arithmetic.
1778*----------------------------------------------------------------------------*/
1779
1780float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1781{
1782    flag aSign, bSign, zSign;
1783    int16 aExp, bExp, zExp;
1784    bits32 aSig, bSig;
1785    bits64 zSig64;
1786    bits32 zSig;
1787
1788    aSig = extractFloat32Frac( a );
1789    aExp = extractFloat32Exp( a );
1790    aSign = extractFloat32Sign( a );
1791    bSig = extractFloat32Frac( b );
1792    bExp = extractFloat32Exp( b );
1793    bSign = extractFloat32Sign( b );
1794    zSign = aSign ^ bSign;
1795    if ( aExp == 0xFF ) {
1796        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1797            return propagateFloat32NaN( a, b STATUS_VAR );
1798        }
1799        if ( ( bExp | bSig ) == 0 ) {
1800            float_raise( float_flag_invalid STATUS_VAR);
1801            return float32_default_nan;
1802        }
1803        return packFloat32( zSign, 0xFF, 0 );
1804    }
1805    if ( bExp == 0xFF ) {
1806        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1807        if ( ( aExp | aSig ) == 0 ) {
1808            float_raise( float_flag_invalid STATUS_VAR);
1809            return float32_default_nan;
1810        }
1811        return packFloat32( zSign, 0xFF, 0 );
1812    }
1813    if ( aExp == 0 ) {
1814        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1815        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1816    }
1817    if ( bExp == 0 ) {
1818        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1819        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1820    }
1821    zExp = aExp + bExp - 0x7F;
1822    aSig = ( aSig | 0x00800000 )<<7;
1823    bSig = ( bSig | 0x00800000 )<<8;
1824    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1825    zSig = zSig64;
1826    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1827        zSig <<= 1;
1828        --zExp;
1829    }
1830    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1831
1832}
1833
1834/*----------------------------------------------------------------------------
1835| Returns the result of dividing the single-precision floating-point value `a'
1836| by the corresponding value `b'.  The operation is performed according to the
1837| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1838*----------------------------------------------------------------------------*/
1839
1840float32 float32_div( float32 a, float32 b STATUS_PARAM )
1841{
1842    flag aSign, bSign, zSign;
1843    int16 aExp, bExp, zExp;
1844    bits32 aSig, bSig, zSig;
1845
1846    aSig = extractFloat32Frac( a );
1847    aExp = extractFloat32Exp( a );
1848    aSign = extractFloat32Sign( a );
1849    bSig = extractFloat32Frac( b );
1850    bExp = extractFloat32Exp( b );
1851    bSign = extractFloat32Sign( b );
1852    zSign = aSign ^ bSign;
1853    if ( aExp == 0xFF ) {
1854        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1855        if ( bExp == 0xFF ) {
1856            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1857            float_raise( float_flag_invalid STATUS_VAR);
1858            return float32_default_nan;
1859        }
1860        return packFloat32( zSign, 0xFF, 0 );
1861    }
1862    if ( bExp == 0xFF ) {
1863        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864        return packFloat32( zSign, 0, 0 );
1865    }
1866    if ( bExp == 0 ) {
1867        if ( bSig == 0 ) {
1868            if ( ( aExp | aSig ) == 0 ) {
1869                float_raise( float_flag_invalid STATUS_VAR);
1870                return float32_default_nan;
1871            }
1872            float_raise( float_flag_divbyzero STATUS_VAR);
1873            return packFloat32( zSign, 0xFF, 0 );
1874        }
1875        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1876    }
1877    if ( aExp == 0 ) {
1878        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1879        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1880    }
1881    zExp = aExp - bExp + 0x7D;
1882    aSig = ( aSig | 0x00800000 )<<7;
1883    bSig = ( bSig | 0x00800000 )<<8;
1884    if ( bSig <= ( aSig + aSig ) ) {
1885        aSig >>= 1;
1886        ++zExp;
1887    }
1888    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1889    if ( ( zSig & 0x3F ) == 0 ) {
1890        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1891    }
1892    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1893
1894}
1895
1896/*----------------------------------------------------------------------------
1897| Returns the remainder of the single-precision floating-point value `a'
1898| with respect to the corresponding value `b'.  The operation is performed
1899| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1900*----------------------------------------------------------------------------*/
1901
1902float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1903{
1904    flag aSign, bSign, zSign;
1905    int16 aExp, bExp, expDiff;
1906    bits32 aSig, bSig;
1907    bits32 q;
1908    bits64 aSig64, bSig64, q64;
1909    bits32 alternateASig;
1910    sbits32 sigMean;
1911
1912    aSig = extractFloat32Frac( a );
1913    aExp = extractFloat32Exp( a );
1914    aSign = extractFloat32Sign( a );
1915    bSig = extractFloat32Frac( b );
1916    bExp = extractFloat32Exp( b );
1917    bSign = extractFloat32Sign( b );
1918    if ( aExp == 0xFF ) {
1919        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1920            return propagateFloat32NaN( a, b STATUS_VAR );
1921        }
1922        float_raise( float_flag_invalid STATUS_VAR);
1923        return float32_default_nan;
1924    }
1925    if ( bExp == 0xFF ) {
1926        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1927        return a;
1928    }
1929    if ( bExp == 0 ) {
1930        if ( bSig == 0 ) {
1931            float_raise( float_flag_invalid STATUS_VAR);
1932            return float32_default_nan;
1933        }
1934        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1935    }
1936    if ( aExp == 0 ) {
1937        if ( aSig == 0 ) return a;
1938        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1939    }
1940    expDiff = aExp - bExp;
1941    aSig |= 0x00800000;
1942    bSig |= 0x00800000;
1943    if ( expDiff < 32 ) {
1944        aSig <<= 8;
1945        bSig <<= 8;
1946        if ( expDiff < 0 ) {
1947            if ( expDiff < -1 ) return a;
1948            aSig >>= 1;
1949        }
1950        q = ( bSig <= aSig );
1951        if ( q ) aSig -= bSig;
1952        if ( 0 < expDiff ) {
1953            q = ( ( (bits64) aSig )<<32 ) / bSig;
1954            q >>= 32 - expDiff;
1955            bSig >>= 2;
1956            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1957        }
1958        else {
1959            aSig >>= 2;
1960            bSig >>= 2;
1961        }
1962    }
1963    else {
1964        if ( bSig <= aSig ) aSig -= bSig;
1965        aSig64 = ( (bits64) aSig )<<40;
1966        bSig64 = ( (bits64) bSig )<<40;
1967        expDiff -= 64;
1968        while ( 0 < expDiff ) {
1969            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1970            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1971            aSig64 = - ( ( bSig * q64 )<<38 );
1972            expDiff -= 62;
1973        }
1974        expDiff += 64;
1975        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1976        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1977        q = q64>>( 64 - expDiff );
1978        bSig <<= 6;
1979        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1980    }
1981    do {
1982        alternateASig = aSig;
1983        ++q;
1984        aSig -= bSig;
1985    } while ( 0 <= (sbits32) aSig );
1986    sigMean = aSig + alternateASig;
1987    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1988        aSig = alternateASig;
1989    }
1990    zSign = ( (sbits32) aSig < 0 );
1991    if ( zSign ) aSig = - aSig;
1992    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1993
1994}
1995
1996/*----------------------------------------------------------------------------
1997| Returns the square root of the single-precision floating-point value `a'.
1998| The operation is performed according to the IEC/IEEE Standard for Binary
1999| Floating-Point Arithmetic.
2000*----------------------------------------------------------------------------*/
2001
2002float32 float32_sqrt( float32 a STATUS_PARAM )
2003{
2004    flag aSign;
2005    int16 aExp, zExp;
2006    bits32 aSig, zSig;
2007    bits64 rem, term;
2008
2009    aSig = extractFloat32Frac( a );
2010    aExp = extractFloat32Exp( a );
2011    aSign = extractFloat32Sign( a );
2012    if ( aExp == 0xFF ) {
2013        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2014        if ( ! aSign ) return a;
2015        float_raise( float_flag_invalid STATUS_VAR);
2016        return float32_default_nan;
2017    }
2018    if ( aSign ) {
2019        if ( ( aExp | aSig ) == 0 ) return a;
2020        float_raise( float_flag_invalid STATUS_VAR);
2021        return float32_default_nan;
2022    }
2023    if ( aExp == 0 ) {
2024        if ( aSig == 0 ) return float32_zero;
2025        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2026    }
2027    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2028    aSig = ( aSig | 0x00800000 )<<8;
2029    zSig = estimateSqrt32( aExp, aSig ) + 2;
2030    if ( ( zSig & 0x7F ) <= 5 ) {
2031        if ( zSig < 2 ) {
2032            zSig = 0x7FFFFFFF;
2033            goto roundAndPack;
2034        }
2035        aSig >>= aExp & 1;
2036        term = ( (bits64) zSig ) * zSig;
2037        rem = ( ( (bits64) aSig )<<32 ) - term;
2038        while ( (sbits64) rem < 0 ) {
2039            --zSig;
2040            rem += ( ( (bits64) zSig )<<1 ) | 1;
2041        }
2042        zSig |= ( rem != 0 );
2043    }
2044    shift32RightJamming( zSig, 1, &zSig );
2045 roundAndPack:
2046    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2047
2048}
2049
2050/*----------------------------------------------------------------------------
2051| Returns 1 if the single-precision floating-point value `a' is equal to
2052| the corresponding value `b', and 0 otherwise.  The comparison is performed
2053| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2054*----------------------------------------------------------------------------*/
2055
2056int float32_eq( float32 a, float32 b STATUS_PARAM )
2057{
2058
2059    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2060         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2061       ) {
2062        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2063            float_raise( float_flag_invalid STATUS_VAR);
2064        }
2065        return 0;
2066    }
2067    return ( float32_val(a) == float32_val(b) ) ||
2068            ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2069
2070}
2071
2072/*----------------------------------------------------------------------------
2073| Returns 1 if the single-precision floating-point value `a' is less than
2074| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2075| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2076| Arithmetic.
2077*----------------------------------------------------------------------------*/
2078
2079int float32_le( float32 a, float32 b STATUS_PARAM )
2080{
2081    flag aSign, bSign;
2082    bits32 av, bv;
2083
2084    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2085         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2086       ) {
2087        float_raise( float_flag_invalid STATUS_VAR);
2088        return 0;
2089    }
2090    aSign = extractFloat32Sign( a );
2091    bSign = extractFloat32Sign( b );
2092    av = float32_val(a);
2093    bv = float32_val(b);
2094    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2095    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2096
2097}
2098
2099/*----------------------------------------------------------------------------
2100| Returns 1 if the single-precision floating-point value `a' is less than
2101| the corresponding value `b', and 0 otherwise.  The comparison is performed
2102| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2103*----------------------------------------------------------------------------*/
2104
2105int float32_lt( float32 a, float32 b STATUS_PARAM )
2106{
2107    flag aSign, bSign;
2108    bits32 av, bv;
2109
2110    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2111         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2112       ) {
2113        float_raise( float_flag_invalid STATUS_VAR);
2114        return 0;
2115    }
2116    aSign = extractFloat32Sign( a );
2117    bSign = extractFloat32Sign( b );
2118    av = float32_val(a);
2119    bv = float32_val(b);
2120    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2121    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2122
2123}
2124
2125/*----------------------------------------------------------------------------
2126| Returns 1 if the single-precision floating-point value `a' is equal to
2127| the corresponding value `b', and 0 otherwise.  The invalid exception is
2128| raised if either operand is a NaN.  Otherwise, the comparison is performed
2129| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2130*----------------------------------------------------------------------------*/
2131
2132int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2133{
2134    bits32 av, bv;
2135
2136    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2137         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2138       ) {
2139        float_raise( float_flag_invalid STATUS_VAR);
2140        return 0;
2141    }
2142    av = float32_val(a);
2143    bv = float32_val(b);
2144    return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2145
2146}
2147
2148/*----------------------------------------------------------------------------
2149| Returns 1 if the single-precision floating-point value `a' is less than or
2150| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2151| cause an exception.  Otherwise, the comparison is performed according to the
2152| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2153*----------------------------------------------------------------------------*/
2154
2155int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2156{
2157    flag aSign, bSign;
2158    bits32 av, bv;
2159
2160    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2161         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2162       ) {
2163        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2164            float_raise( float_flag_invalid STATUS_VAR);
2165        }
2166        return 0;
2167    }
2168    aSign = extractFloat32Sign( a );
2169    bSign = extractFloat32Sign( b );
2170    av = float32_val(a);
2171    bv = float32_val(b);
2172    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2173    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2174
2175}
2176
2177/*----------------------------------------------------------------------------
2178| Returns 1 if the single-precision floating-point value `a' is less than
2179| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2180| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2181| Standard for Binary Floating-Point Arithmetic.
2182*----------------------------------------------------------------------------*/
2183
2184int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2185{
2186    flag aSign, bSign;
2187    bits32 av, bv;
2188
2189    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2190         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2191       ) {
2192        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2193            float_raise( float_flag_invalid STATUS_VAR);
2194        }
2195        return 0;
2196    }
2197    aSign = extractFloat32Sign( a );
2198    bSign = extractFloat32Sign( b );
2199    av = float32_val(a);
2200    bv = float32_val(b);
2201    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2202    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2203
2204}
2205
2206/*----------------------------------------------------------------------------
2207| Returns the result of converting the double-precision floating-point value
2208| `a' to the 32-bit two's complement integer format.  The conversion is
2209| performed according to the IEC/IEEE Standard for Binary Floating-Point
2210| Arithmetic---which means in particular that the conversion is rounded
2211| according to the current rounding mode.  If `a' is a NaN, the largest
2212| positive integer is returned.  Otherwise, if the conversion overflows, the
2213| largest integer with the same sign as `a' is returned.
2214*----------------------------------------------------------------------------*/
2215
2216int32 float64_to_int32( float64 a STATUS_PARAM )
2217{
2218    flag aSign;
2219    int16 aExp, shiftCount;
2220    bits64 aSig;
2221
2222    aSig = extractFloat64Frac( a );
2223    aExp = extractFloat64Exp( a );
2224    aSign = extractFloat64Sign( a );
2225    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2226    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2227    shiftCount = 0x42C - aExp;
2228    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2229    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2230
2231}
2232
2233/*----------------------------------------------------------------------------
2234| Returns the result of converting the double-precision floating-point value
2235| `a' to the 32-bit two's complement integer format.  The conversion is
2236| performed according to the IEC/IEEE Standard for Binary Floating-Point
2237| Arithmetic, except that the conversion is always rounded toward zero.
2238| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2239| the conversion overflows, the largest integer with the same sign as `a' is
2240| returned.
2241*----------------------------------------------------------------------------*/
2242
2243int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2244{
2245    flag aSign;
2246    int16 aExp, shiftCount;
2247    bits64 aSig, savedASig;
2248    int32 z;
2249
2250    aSig = extractFloat64Frac( a );
2251    aExp = extractFloat64Exp( a );
2252    aSign = extractFloat64Sign( a );
2253    if ( 0x41E < aExp ) {
2254        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2255        goto invalid;
2256    }
2257    else if ( aExp < 0x3FF ) {
2258        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2259        return 0;
2260    }
2261    aSig |= LIT64( 0x0010000000000000 );
2262    shiftCount = 0x433 - aExp;
2263    savedASig = aSig;
2264    aSig >>= shiftCount;
2265    z = aSig;
2266    if ( aSign ) z = - z;
2267    if ( ( z < 0 ) ^ aSign ) {
2268 invalid:
2269        float_raise( float_flag_invalid STATUS_VAR);
2270        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2271    }
2272    if ( ( aSig<<shiftCount ) != savedASig ) {
2273        STATUS(float_exception_flags) |= float_flag_inexact;
2274    }
2275    return z;
2276
2277}
2278
2279/*----------------------------------------------------------------------------
2280| Returns the result of converting the double-precision floating-point value
2281| `a' to the 64-bit two's complement integer format.  The conversion is
2282| performed according to the IEC/IEEE Standard for Binary Floating-Point
2283| Arithmetic---which means in particular that the conversion is rounded
2284| according to the current rounding mode.  If `a' is a NaN, the largest
2285| positive integer is returned.  Otherwise, if the conversion overflows, the
2286| largest integer with the same sign as `a' is returned.
2287*----------------------------------------------------------------------------*/
2288
2289int64 float64_to_int64( float64 a STATUS_PARAM )
2290{
2291    flag aSign;
2292    int16 aExp, shiftCount;
2293    bits64 aSig, aSigExtra;
2294
2295    aSig = extractFloat64Frac( a );
2296    aExp = extractFloat64Exp( a );
2297    aSign = extractFloat64Sign( a );
2298    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2299    shiftCount = 0x433 - aExp;
2300    if ( shiftCount <= 0 ) {
2301        if ( 0x43E < aExp ) {
2302            float_raise( float_flag_invalid STATUS_VAR);
2303            if (    ! aSign
2304                 || (    ( aExp == 0x7FF )
2305                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2306               ) {
2307                return LIT64( 0x7FFFFFFFFFFFFFFF );
2308            }
2309            return (sbits64) LIT64( 0x8000000000000000 );
2310        }
2311        aSigExtra = 0;
2312        aSig <<= - shiftCount;
2313    }
2314    else {
2315        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2316    }
2317    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2318
2319}
2320
2321/*----------------------------------------------------------------------------
2322| Returns the result of converting the double-precision floating-point value
2323| `a' to the 64-bit two's complement integer format.  The conversion is
2324| performed according to the IEC/IEEE Standard for Binary Floating-Point
2325| Arithmetic, except that the conversion is always rounded toward zero.
2326| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2327| the conversion overflows, the largest integer with the same sign as `a' is
2328| returned.
2329*----------------------------------------------------------------------------*/
2330
2331int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2332{
2333    flag aSign;
2334    int16 aExp, shiftCount;
2335    bits64 aSig;
2336    int64 z;
2337
2338    aSig = extractFloat64Frac( a );
2339    aExp = extractFloat64Exp( a );
2340    aSign = extractFloat64Sign( a );
2341    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2342    shiftCount = aExp - 0x433;
2343    if ( 0 <= shiftCount ) {
2344        if ( 0x43E <= aExp ) {
2345            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2346                float_raise( float_flag_invalid STATUS_VAR);
2347                if (    ! aSign
2348                     || (    ( aExp == 0x7FF )
2349                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2350                   ) {
2351                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2352                }
2353            }
2354            return (sbits64) LIT64( 0x8000000000000000 );
2355        }
2356        z = aSig<<shiftCount;
2357    }
2358    else {
2359        if ( aExp < 0x3FE ) {
2360            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2361            return 0;
2362        }
2363        z = aSig>>( - shiftCount );
2364        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2365            STATUS(float_exception_flags) |= float_flag_inexact;
2366        }
2367    }
2368    if ( aSign ) z = - z;
2369    return z;
2370
2371}
2372
2373/*----------------------------------------------------------------------------
2374| Returns the result of converting the double-precision floating-point value
2375| `a' to the single-precision floating-point format.  The conversion is
2376| performed according to the IEC/IEEE Standard for Binary Floating-Point
2377| Arithmetic.
2378*----------------------------------------------------------------------------*/
2379
2380float32 float64_to_float32( float64 a STATUS_PARAM )
2381{
2382    flag aSign;
2383    int16 aExp;
2384    bits64 aSig;
2385    bits32 zSig;
2386
2387    aSig = extractFloat64Frac( a );
2388    aExp = extractFloat64Exp( a );
2389    aSign = extractFloat64Sign( a );
2390    if ( aExp == 0x7FF ) {
2391        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2392        return packFloat32( aSign, 0xFF, 0 );
2393    }
2394    shift64RightJamming( aSig, 22, &aSig );
2395    zSig = aSig;
2396    if ( aExp || zSig ) {
2397        zSig |= 0x40000000;
2398        aExp -= 0x381;
2399    }
2400    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2401
2402}
2403
2404#ifdef FLOATX80
2405
2406/*----------------------------------------------------------------------------
2407| Returns the result of converting the double-precision floating-point value
2408| `a' to the extended double-precision floating-point format.  The conversion
2409| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2410| Arithmetic.
2411*----------------------------------------------------------------------------*/
2412
2413floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2414{
2415    flag aSign;
2416    int16 aExp;
2417    bits64 aSig;
2418
2419    aSig = extractFloat64Frac( a );
2420    aExp = extractFloat64Exp( a );
2421    aSign = extractFloat64Sign( a );
2422    if ( aExp == 0x7FF ) {
2423        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2424        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2425    }
2426    if ( aExp == 0 ) {
2427        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2428        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2429    }
2430    return
2431        packFloatx80(
2432            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2433
2434}
2435
2436#endif
2437
2438#ifdef FLOAT128
2439
2440/*----------------------------------------------------------------------------
2441| Returns the result of converting the double-precision floating-point value
2442| `a' to the quadruple-precision floating-point format.  The conversion is
2443| performed according to the IEC/IEEE Standard for Binary Floating-Point
2444| Arithmetic.
2445*----------------------------------------------------------------------------*/
2446
2447float128 float64_to_float128( float64 a STATUS_PARAM )
2448{
2449    flag aSign;
2450    int16 aExp;
2451    bits64 aSig, zSig0, zSig1;
2452
2453    aSig = extractFloat64Frac( a );
2454    aExp = extractFloat64Exp( a );
2455    aSign = extractFloat64Sign( a );
2456    if ( aExp == 0x7FF ) {
2457        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2458        return packFloat128( aSign, 0x7FFF, 0, 0 );
2459    }
2460    if ( aExp == 0 ) {
2461        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2462        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2463        --aExp;
2464    }
2465    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2466    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2467
2468}
2469
2470#endif
2471
2472/*----------------------------------------------------------------------------
2473| Rounds the double-precision floating-point value `a' to an integer, and
2474| returns the result as a double-precision floating-point value.  The
2475| operation is performed according to the IEC/IEEE Standard for Binary
2476| Floating-Point Arithmetic.
2477*----------------------------------------------------------------------------*/
2478
2479float64 float64_round_to_int( float64 a STATUS_PARAM )
2480{
2481    flag aSign;
2482    int16 aExp;
2483    bits64 lastBitMask, roundBitsMask;
2484    int8 roundingMode;
2485    bits64 z;
2486
2487    aExp = extractFloat64Exp( a );
2488    if ( 0x433 <= aExp ) {
2489        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2490            return propagateFloat64NaN( a, a STATUS_VAR );
2491        }
2492        return a;
2493    }
2494    if ( aExp < 0x3FF ) {
2495        if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2496        STATUS(float_exception_flags) |= float_flag_inexact;
2497        aSign = extractFloat64Sign( a );
2498        switch ( STATUS(float_rounding_mode) ) {
2499         case float_round_nearest_even:
2500            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2501                return packFloat64( aSign, 0x3FF, 0 );
2502            }
2503            break;
2504         case float_round_down:
2505            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2506         case float_round_up:
2507            return make_float64(
2508            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2509        }
2510        return packFloat64( aSign, 0, 0 );
2511    }
2512    lastBitMask = 1;
2513    lastBitMask <<= 0x433 - aExp;
2514    roundBitsMask = lastBitMask - 1;
2515    z = float64_val(a);
2516    roundingMode = STATUS(float_rounding_mode);
2517    if ( roundingMode == float_round_nearest_even ) {
2518        z += lastBitMask>>1;
2519        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2520    }
2521    else if ( roundingMode != float_round_to_zero ) {
2522        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2523            z += roundBitsMask;
2524        }
2525    }
2526    z &= ~ roundBitsMask;
2527    if ( z != float64_val(a) )
2528        STATUS(float_exception_flags) |= float_flag_inexact;
2529    return make_float64(z);
2530
2531}
2532
2533float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2534{
2535    int oldmode;
2536    float64 res;
2537    oldmode = STATUS(float_rounding_mode);
2538    STATUS(float_rounding_mode) = float_round_to_zero;
2539    res = float64_round_to_int(a STATUS_VAR);
2540    STATUS(float_rounding_mode) = oldmode;
2541    return res;
2542}
2543
2544/*----------------------------------------------------------------------------
2545| Returns the result of adding the absolute values of the double-precision
2546| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2547| before being returned.  `zSign' is ignored if the result is a NaN.
2548| The addition is performed according to the IEC/IEEE Standard for Binary
2549| Floating-Point Arithmetic.
2550*----------------------------------------------------------------------------*/
2551
2552static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2553{
2554    int16 aExp, bExp, zExp;
2555    bits64 aSig, bSig, zSig;
2556    int16 expDiff;
2557
2558    aSig = extractFloat64Frac( a );
2559    aExp = extractFloat64Exp( a );
2560    bSig = extractFloat64Frac( b );
2561    bExp = extractFloat64Exp( b );
2562    expDiff = aExp - bExp;
2563    aSig <<= 9;
2564    bSig <<= 9;
2565    if ( 0 < expDiff ) {
2566        if ( aExp == 0x7FF ) {
2567            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2568            return a;
2569        }
2570        if ( bExp == 0 ) {
2571            --expDiff;
2572        }
2573        else {
2574            bSig |= LIT64( 0x2000000000000000 );
2575        }
2576        shift64RightJamming( bSig, expDiff, &bSig );
2577        zExp = aExp;
2578    }
2579    else if ( expDiff < 0 ) {
2580        if ( bExp == 0x7FF ) {
2581            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2582            return packFloat64( zSign, 0x7FF, 0 );
2583        }
2584        if ( aExp == 0 ) {
2585            ++expDiff;
2586        }
2587        else {
2588            aSig |= LIT64( 0x2000000000000000 );
2589        }
2590        shift64RightJamming( aSig, - expDiff, &aSig );
2591        zExp = bExp;
2592    }
2593    else {
2594        if ( aExp == 0x7FF ) {
2595            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2596            return a;
2597        }
2598        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2599        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2600        zExp = aExp;
2601        goto roundAndPack;
2602    }
2603    aSig |= LIT64( 0x2000000000000000 );
2604    zSig = ( aSig + bSig )<<1;
2605    --zExp;
2606    if ( (sbits64) zSig < 0 ) {
2607        zSig = aSig + bSig;
2608        ++zExp;
2609    }
2610 roundAndPack:
2611    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2612
2613}
2614
2615/*----------------------------------------------------------------------------
2616| Returns the result of subtracting the absolute values of the double-
2617| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2618| difference is negated before being returned.  `zSign' is ignored if the
2619| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2620| Standard for Binary Floating-Point Arithmetic.
2621*----------------------------------------------------------------------------*/
2622
2623static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2624{
2625    int16 aExp, bExp, zExp;
2626    bits64 aSig, bSig, zSig;
2627    int16 expDiff;
2628
2629    aSig = extractFloat64Frac( a );
2630    aExp = extractFloat64Exp( a );
2631    bSig = extractFloat64Frac( b );
2632    bExp = extractFloat64Exp( b );
2633    expDiff = aExp - bExp;
2634    aSig <<= 10;
2635    bSig <<= 10;
2636    if ( 0 < expDiff ) goto aExpBigger;
2637    if ( expDiff < 0 ) goto bExpBigger;
2638    if ( aExp == 0x7FF ) {
2639        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2640        float_raise( float_flag_invalid STATUS_VAR);
2641        return float64_default_nan;
2642    }
2643    if ( aExp == 0 ) {
2644        aExp = 1;
2645        bExp = 1;
2646    }
2647    if ( bSig < aSig ) goto aBigger;
2648    if ( aSig < bSig ) goto bBigger;
2649    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2650 bExpBigger:
2651    if ( bExp == 0x7FF ) {
2652        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2653        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2654    }
2655    if ( aExp == 0 ) {
2656        ++expDiff;
2657    }
2658    else {
2659        aSig |= LIT64( 0x4000000000000000 );
2660    }
2661    shift64RightJamming( aSig, - expDiff, &aSig );
2662    bSig |= LIT64( 0x4000000000000000 );
2663 bBigger:
2664    zSig = bSig - aSig;
2665    zExp = bExp;
2666    zSign ^= 1;
2667    goto normalizeRoundAndPack;
2668 aExpBigger:
2669    if ( aExp == 0x7FF ) {
2670        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2671        return a;
2672    }
2673    if ( bExp == 0 ) {
2674        --expDiff;
2675    }
2676    else {
2677        bSig |= LIT64( 0x4000000000000000 );
2678    }
2679    shift64RightJamming( bSig, expDiff, &bSig );
2680    aSig |= LIT64( 0x4000000000000000 );
2681 aBigger:
2682    zSig = aSig - bSig;
2683    zExp = aExp;
2684 normalizeRoundAndPack:
2685    --zExp;
2686    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2687
2688}
2689
2690/*----------------------------------------------------------------------------
2691| Returns the result of adding the double-precision floating-point values `a'
2692| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2693| Binary Floating-Point Arithmetic.
2694*----------------------------------------------------------------------------*/
2695
2696float64 float64_add( float64 a, float64 b STATUS_PARAM )
2697{
2698    flag aSign, bSign;
2699
2700    aSign = extractFloat64Sign( a );
2701    bSign = extractFloat64Sign( b );
2702    if ( aSign == bSign ) {
2703        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2704    }
2705    else {
2706        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2707    }
2708
2709}
2710
2711/*----------------------------------------------------------------------------
2712| Returns the result of subtracting the double-precision floating-point values
2713| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2714| for Binary Floating-Point Arithmetic.
2715*----------------------------------------------------------------------------*/
2716
2717float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2718{
2719    flag aSign, bSign;
2720
2721    aSign = extractFloat64Sign( a );
2722    bSign = extractFloat64Sign( b );
2723    if ( aSign == bSign ) {
2724        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2725    }
2726    else {
2727        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2728    }
2729
2730}
2731
2732/*----------------------------------------------------------------------------
2733| Returns the result of multiplying the double-precision floating-point values
2734| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2735| for Binary Floating-Point Arithmetic.
2736*----------------------------------------------------------------------------*/
2737
2738float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2739{
2740    flag aSign, bSign, zSign;
2741    int16 aExp, bExp, zExp;
2742    bits64 aSig, bSig, zSig0, zSig1;
2743
2744    aSig = extractFloat64Frac( a );
2745    aExp = extractFloat64Exp( a );
2746    aSign = extractFloat64Sign( a );
2747    bSig = extractFloat64Frac( b );
2748    bExp = extractFloat64Exp( b );
2749    bSign = extractFloat64Sign( b );
2750    zSign = aSign ^ bSign;
2751    if ( aExp == 0x7FF ) {
2752        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2753            return propagateFloat64NaN( a, b STATUS_VAR );
2754        }
2755        if ( ( bExp | bSig ) == 0 ) {
2756            float_raise( float_flag_invalid STATUS_VAR);
2757            return float64_default_nan;
2758        }
2759        return packFloat64( zSign, 0x7FF, 0 );
2760    }
2761    if ( bExp == 0x7FF ) {
2762        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2763        if ( ( aExp | aSig ) == 0 ) {
2764            float_raise( float_flag_invalid STATUS_VAR);
2765            return float64_default_nan;
2766        }
2767        return packFloat64( zSign, 0x7FF, 0 );
2768    }
2769    if ( aExp == 0 ) {
2770        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2771        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2772    }
2773    if ( bExp == 0 ) {
2774        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2775        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2776    }
2777    zExp = aExp + bExp - 0x3FF;
2778    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2779    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2780    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2781    zSig0 |= ( zSig1 != 0 );
2782    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2783        zSig0 <<= 1;
2784        --zExp;
2785    }
2786    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2787
2788}
2789
2790/*----------------------------------------------------------------------------
2791| Returns the result of dividing the double-precision floating-point value `a'
2792| by the corresponding value `b'.  The operation is performed according to
2793| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2794*----------------------------------------------------------------------------*/
2795
2796float64 float64_div( float64 a, float64 b STATUS_PARAM )
2797{
2798    flag aSign, bSign, zSign;
2799    int16 aExp, bExp, zExp;
2800    bits64 aSig, bSig, zSig;
2801    bits64 rem0, rem1;
2802    bits64 term0, term1;
2803
2804    aSig = extractFloat64Frac( a );
2805    aExp = extractFloat64Exp( a );
2806    aSign = extractFloat64Sign( a );
2807    bSig = extractFloat64Frac( b );
2808    bExp = extractFloat64Exp( b );
2809    bSign = extractFloat64Sign( b );
2810    zSign = aSign ^ bSign;
2811    if ( aExp == 0x7FF ) {
2812        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2813        if ( bExp == 0x7FF ) {
2814            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2815            float_raise( float_flag_invalid STATUS_VAR);
2816            return float64_default_nan;
2817        }
2818        return packFloat64( zSign, 0x7FF, 0 );
2819    }
2820    if ( bExp == 0x7FF ) {
2821        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2822        return packFloat64( zSign, 0, 0 );
2823    }
2824    if ( bExp == 0 ) {
2825        if ( bSig == 0 ) {
2826            if ( ( aExp | aSig ) == 0 ) {
2827                float_raise( float_flag_invalid STATUS_VAR);
2828                return float64_default_nan;
2829            }
2830            float_raise( float_flag_divbyzero STATUS_VAR);
2831            return packFloat64( zSign, 0x7FF, 0 );
2832        }
2833        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2834    }
2835    if ( aExp == 0 ) {
2836        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2837        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2838    }
2839    zExp = aExp - bExp + 0x3FD;
2840    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2841    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2842    if ( bSig <= ( aSig + aSig ) ) {
2843        aSig >>= 1;
2844        ++zExp;
2845    }
2846    zSig = estimateDiv128To64( aSig, 0, bSig );
2847    if ( ( zSig & 0x1FF ) <= 2 ) {
2848        mul64To128( bSig, zSig, &term0, &term1 );
2849        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2850        while ( (sbits64) rem0 < 0 ) {
2851            --zSig;
2852            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2853        }
2854        zSig |= ( rem1 != 0 );
2855    }
2856    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2857
2858}
2859
2860/*----------------------------------------------------------------------------
2861| Returns the remainder of the double-precision floating-point value `a'
2862| with respect to the corresponding value `b'.  The operation is performed
2863| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864*----------------------------------------------------------------------------*/
2865
2866float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2867{
2868    flag aSign, bSign, zSign;
2869    int16 aExp, bExp, expDiff;
2870    bits64 aSig, bSig;
2871    bits64 q, alternateASig;
2872    sbits64 sigMean;
2873
2874    aSig = extractFloat64Frac( a );
2875    aExp = extractFloat64Exp( a );
2876    aSign = extractFloat64Sign( a );
2877    bSig = extractFloat64Frac( b );
2878    bExp = extractFloat64Exp( b );
2879    bSign = extractFloat64Sign( b );
2880    if ( aExp == 0x7FF ) {
2881        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2882            return propagateFloat64NaN( a, b STATUS_VAR );
2883        }
2884        float_raise( float_flag_invalid STATUS_VAR);
2885        return float64_default_nan;
2886    }
2887    if ( bExp == 0x7FF ) {
2888        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2889        return a;
2890    }
2891    if ( bExp == 0 ) {
2892        if ( bSig == 0 ) {
2893            float_raise( float_flag_invalid STATUS_VAR);
2894            return float64_default_nan;
2895        }
2896        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2897    }
2898    if ( aExp == 0 ) {
2899        if ( aSig == 0 ) return a;
2900        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2901    }
2902    expDiff = aExp - bExp;
2903    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2904    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2905    if ( expDiff < 0 ) {
2906        if ( expDiff < -1 ) return a;
2907        aSig >>= 1;
2908    }
2909    q = ( bSig <= aSig );
2910    if ( q ) aSig -= bSig;
2911    expDiff -= 64;
2912    while ( 0 < expDiff ) {
2913        q = estimateDiv128To64( aSig, 0, bSig );
2914        q = ( 2 < q ) ? q - 2 : 0;
2915        aSig = - ( ( bSig>>2 ) * q );
2916        expDiff -= 62;
2917    }
2918    expDiff += 64;
2919    if ( 0 < expDiff ) {
2920        q = estimateDiv128To64( aSig, 0, bSig );
2921        q = ( 2 < q ) ? q - 2 : 0;
2922        q >>= 64 - expDiff;
2923        bSig >>= 2;
2924        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2925    }
2926    else {
2927        aSig >>= 2;
2928        bSig >>= 2;
2929    }
2930    do {
2931        alternateASig = aSig;
2932        ++q;
2933        aSig -= bSig;
2934    } while ( 0 <= (sbits64) aSig );
2935    sigMean = aSig + alternateASig;
2936    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2937        aSig = alternateASig;
2938    }
2939    zSign = ( (sbits64) aSig < 0 );
2940    if ( zSign ) aSig = - aSig;
2941    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2942
2943}
2944
2945/*----------------------------------------------------------------------------
2946| Returns the square root of the double-precision floating-point value `a'.
2947| The operation is performed according to the IEC/IEEE Standard for Binary
2948| Floating-Point Arithmetic.
2949*----------------------------------------------------------------------------*/
2950
2951float64 float64_sqrt( float64 a STATUS_PARAM )
2952{
2953    flag aSign;
2954    int16 aExp, zExp;
2955    bits64 aSig, zSig, doubleZSig;
2956    bits64 rem0, rem1, term0, term1;
2957
2958    aSig = extractFloat64Frac( a );
2959    aExp = extractFloat64Exp( a );
2960    aSign = extractFloat64Sign( a );
2961    if ( aExp == 0x7FF ) {
2962        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2963        if ( ! aSign ) return a;
2964        float_raise( float_flag_invalid STATUS_VAR);
2965        return float64_default_nan;
2966    }
2967    if ( aSign ) {
2968        if ( ( aExp | aSig ) == 0 ) return a;
2969        float_raise( float_flag_invalid STATUS_VAR);
2970        return float64_default_nan;
2971    }
2972    if ( aExp == 0 ) {
2973        if ( aSig == 0 ) return float64_zero;
2974        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2975    }
2976    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2977    aSig |= LIT64( 0x0010000000000000 );
2978    zSig = estimateSqrt32( aExp, aSig>>21 );
2979    aSig <<= 9 - ( aExp & 1 );
2980    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2981    if ( ( zSig & 0x1FF ) <= 5 ) {
2982        doubleZSig = zSig<<1;
2983        mul64To128( zSig, zSig, &term0, &term1 );
2984        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2985        while ( (sbits64) rem0 < 0 ) {
2986            --zSig;
2987            doubleZSig -= 2;
2988            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2989        }
2990        zSig |= ( ( rem0 | rem1 ) != 0 );
2991    }
2992    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2993
2994}
2995
2996/*----------------------------------------------------------------------------
2997| Returns 1 if the double-precision floating-point value `a' is equal to the
2998| corresponding value `b', and 0 otherwise.  The comparison is performed
2999| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3000*----------------------------------------------------------------------------*/
3001
3002int float64_eq( float64 a, float64 b STATUS_PARAM )
3003{
3004    bits64 av, bv;
3005
3006    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3007         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3008       ) {
3009        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3010            float_raise( float_flag_invalid STATUS_VAR);
3011        }
3012        return 0;
3013    }
3014    av = float64_val(a);
3015    bv = float64_val(b);
3016    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3017
3018}
3019
3020/*----------------------------------------------------------------------------
3021| Returns 1 if the double-precision floating-point value `a' is less than or
3022| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3023| performed according to the IEC/IEEE Standard for Binary Floating-Point
3024| Arithmetic.
3025*----------------------------------------------------------------------------*/
3026
3027int float64_le( float64 a, float64 b STATUS_PARAM )
3028{
3029    flag aSign, bSign;
3030    bits64 av, bv;
3031
3032    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3033         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3034       ) {
3035        float_raise( float_flag_invalid STATUS_VAR);
3036        return 0;
3037    }
3038    aSign = extractFloat64Sign( a );
3039    bSign = extractFloat64Sign( b );
3040    av = float64_val(a);
3041    bv = float64_val(b);
3042    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3043    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3044
3045}
3046
3047/*----------------------------------------------------------------------------
3048| Returns 1 if the double-precision floating-point value `a' is less than
3049| the corresponding value `b', and 0 otherwise.  The comparison is performed
3050| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3051*----------------------------------------------------------------------------*/
3052
3053int float64_lt( float64 a, float64 b STATUS_PARAM )
3054{
3055    flag aSign, bSign;
3056    bits64 av, bv;
3057
3058    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3059         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3060       ) {
3061        float_raise( float_flag_invalid STATUS_VAR);
3062        return 0;
3063    }
3064    aSign = extractFloat64Sign( a );
3065    bSign = extractFloat64Sign( b );
3066    av = float64_val(a);
3067    bv = float64_val(b);
3068    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3069    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3070
3071}
3072
3073/*----------------------------------------------------------------------------
3074| Returns 1 if the double-precision floating-point value `a' is equal to the
3075| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3076| if either operand is a NaN.  Otherwise, the comparison is performed
3077| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3078*----------------------------------------------------------------------------*/
3079
3080int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3081{
3082    bits64 av, bv;
3083
3084    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3085         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3086       ) {
3087        float_raise( float_flag_invalid STATUS_VAR);
3088        return 0;
3089    }
3090    av = float64_val(a);
3091    bv = float64_val(b);
3092    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3093
3094}
3095
3096/*----------------------------------------------------------------------------
3097| Returns 1 if the double-precision floating-point value `a' is less than or
3098| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3099| cause an exception.  Otherwise, the comparison is performed according to the
3100| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3101*----------------------------------------------------------------------------*/
3102
3103int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3104{
3105    flag aSign, bSign;
3106    bits64 av, bv;
3107
3108    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3109         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3110       ) {
3111        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3112            float_raise( float_flag_invalid STATUS_VAR);
3113        }
3114        return 0;
3115    }
3116    aSign = extractFloat64Sign( a );
3117    bSign = extractFloat64Sign( b );
3118    av = float64_val(a);
3119    bv = float64_val(b);
3120    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3121    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3122
3123}
3124
3125/*----------------------------------------------------------------------------
3126| Returns 1 if the double-precision floating-point value `a' is less than
3127| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3128| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3129| Standard for Binary Floating-Point Arithmetic.
3130*----------------------------------------------------------------------------*/
3131
3132int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3133{
3134    flag aSign, bSign;
3135    bits64 av, bv;
3136
3137    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3138         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3139       ) {
3140        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3141            float_raise( float_flag_invalid STATUS_VAR);
3142        }
3143        return 0;
3144    }
3145    aSign = extractFloat64Sign( a );
3146    bSign = extractFloat64Sign( b );
3147    av = float64_val(a);
3148    bv = float64_val(b);
3149    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3150    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3151
3152}
3153
3154#ifdef FLOATX80
3155
3156/*----------------------------------------------------------------------------
3157| Returns the result of converting the extended double-precision floating-
3158| point value `a' to the 32-bit two's complement integer format.  The
3159| conversion is performed according to the IEC/IEEE Standard for Binary
3160| Floating-Point Arithmetic---which means in particular that the conversion
3161| is rounded according to the current rounding mode.  If `a' is a NaN, the
3162| largest positive integer is returned.  Otherwise, if the conversion
3163| overflows, the largest integer with the same sign as `a' is returned.
3164*----------------------------------------------------------------------------*/
3165
3166int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3167{
3168    flag aSign;
3169    int32 aExp, shiftCount;
3170    bits64 aSig;
3171
3172    aSig = extractFloatx80Frac( a );
3173    aExp = extractFloatx80Exp( a );
3174    aSign = extractFloatx80Sign( a );
3175    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3176    shiftCount = 0x4037 - aExp;
3177    if ( shiftCount <= 0 ) shiftCount = 1;
3178    shift64RightJamming( aSig, shiftCount, &aSig );
3179    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3180
3181}
3182
3183/*----------------------------------------------------------------------------
3184| Returns the result of converting the extended double-precision floating-
3185| point value `a' to the 32-bit two's complement integer format.  The
3186| conversion is performed according to the IEC/IEEE Standard for Binary
3187| Floating-Point Arithmetic, except that the conversion is always rounded
3188| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3189| Otherwise, if the conversion overflows, the largest integer with the same
3190| sign as `a' is returned.
3191*----------------------------------------------------------------------------*/
3192
3193int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3194{
3195    flag aSign;
3196    int32 aExp, shiftCount;
3197    bits64 aSig, savedASig;
3198    int32 z;
3199
3200    aSig = extractFloatx80Frac( a );
3201    aExp = extractFloatx80Exp( a );
3202    aSign = extractFloatx80Sign( a );
3203    if ( 0x401E < aExp ) {
3204        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3205        goto invalid;
3206    }
3207    else if ( aExp < 0x3FFF ) {
3208        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3209        return 0;
3210    }
3211    shiftCount = 0x403E - aExp;
3212    savedASig = aSig;
3213    aSig >>= shiftCount;
3214    z = aSig;
3215    if ( aSign ) z = - z;
3216    if ( ( z < 0 ) ^ aSign ) {
3217 invalid:
3218        float_raise( float_flag_invalid STATUS_VAR);
3219        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3220    }
3221    if ( ( aSig<<shiftCount ) != savedASig ) {
3222        STATUS(float_exception_flags) |= float_flag_inexact;
3223    }
3224    return z;
3225
3226}
3227
3228/*----------------------------------------------------------------------------
3229| Returns the result of converting the extended double-precision floating-
3230| point value `a' to the 64-bit two's complement integer format.  The
3231| conversion is performed according to the IEC/IEEE Standard for Binary
3232| Floating-Point Arithmetic---which means in particular that the conversion
3233| is rounded according to the current rounding mode.  If `a' is a NaN,
3234| the largest positive integer is returned.  Otherwise, if the conversion
3235| overflows, the largest integer with the same sign as `a' is returned.
3236*----------------------------------------------------------------------------*/
3237
3238int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3239{
3240    flag aSign;
3241    int32 aExp, shiftCount;
3242    bits64 aSig, aSigExtra;
3243
3244    aSig = extractFloatx80Frac( a );
3245    aExp = extractFloatx80Exp( a );
3246    aSign = extractFloatx80Sign( a );
3247    shiftCount = 0x403E - aExp;
3248    if ( shiftCount <= 0 ) {
3249        if ( shiftCount ) {
3250            float_raise( float_flag_invalid STATUS_VAR);
3251            if (    ! aSign
3252                 || (    ( aExp == 0x7FFF )
3253                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3254               ) {
3255                return LIT64( 0x7FFFFFFFFFFFFFFF );
3256            }
3257            return (sbits64) LIT64( 0x8000000000000000 );
3258        }
3259        aSigExtra = 0;
3260    }
3261    else {
3262        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3263    }
3264    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3265
3266}
3267
3268/*----------------------------------------------------------------------------
3269| Returns the result of converting the extended double-precision floating-
3270| point value `a' to the 64-bit two's complement integer format.  The
3271| conversion is performed according to the IEC/IEEE Standard for Binary
3272| Floating-Point Arithmetic, except that the conversion is always rounded
3273| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3274| Otherwise, if the conversion overflows, the largest integer with the same
3275| sign as `a' is returned.
3276*----------------------------------------------------------------------------*/
3277
3278int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3279{
3280    flag aSign;
3281    int32 aExp, shiftCount;
3282    bits64 aSig;
3283    int64 z;
3284
3285    aSig = extractFloatx80Frac( a );
3286    aExp = extractFloatx80Exp( a );
3287    aSign = extractFloatx80Sign( a );
3288    shiftCount = aExp - 0x403E;
3289    if ( 0 <= shiftCount ) {
3290        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3291        if ( ( a.high != 0xC03E ) || aSig ) {
3292            float_raise( float_flag_invalid STATUS_VAR);
3293            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3294                return LIT64( 0x7FFFFFFFFFFFFFFF );
3295            }
3296        }
3297        return (sbits64) LIT64( 0x8000000000000000 );
3298    }
3299    else if ( aExp < 0x3FFF ) {
3300        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3301        return 0;
3302    }
3303    z = aSig>>( - shiftCount );
3304    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3305        STATUS(float_exception_flags) |= float_flag_inexact;
3306    }
3307    if ( aSign ) z = - z;
3308    return z;
3309
3310}
3311
3312/*----------------------------------------------------------------------------
3313| Returns the result of converting the extended double-precision floating-
3314| point value `a' to the single-precision floating-point format.  The
3315| conversion is performed according to the IEC/IEEE Standard for Binary
3316| Floating-Point Arithmetic.
3317*----------------------------------------------------------------------------*/
3318
3319float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3320{
3321    flag aSign;
3322    int32 aExp;
3323    bits64 aSig;
3324
3325    aSig = extractFloatx80Frac( a );
3326    aExp = extractFloatx80Exp( a );
3327    aSign = extractFloatx80Sign( a );
3328    if ( aExp == 0x7FFF ) {
3329        if ( (bits64) ( aSig<<1 ) ) {
3330            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3331        }
3332        return packFloat32( aSign, 0xFF, 0 );
3333    }
3334    shift64RightJamming( aSig, 33, &aSig );
3335    if ( aExp || aSig ) aExp -= 0x3F81;
3336    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3337
3338}
3339
3340/*----------------------------------------------------------------------------
3341| Returns the result of converting the extended double-precision floating-
3342| point value `a' to the double-precision floating-point format.  The
3343| conversion is performed according to the IEC/IEEE Standard for Binary
3344| Floating-Point Arithmetic.
3345*----------------------------------------------------------------------------*/
3346
3347float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3348{
3349    flag aSign;
3350    int32 aExp;
3351    bits64 aSig, zSig;
3352
3353    aSig = extractFloatx80Frac( a );
3354    aExp = extractFloatx80Exp( a );
3355    aSign = extractFloatx80Sign( a );
3356    if ( aExp == 0x7FFF ) {
3357        if ( (bits64) ( aSig<<1 ) ) {
3358            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3359        }
3360        return packFloat64( aSign, 0x7FF, 0 );
3361    }
3362    shift64RightJamming( aSig, 1, &zSig );
3363    if ( aExp || aSig ) aExp -= 0x3C01;
3364    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3365
3366}
3367
3368#ifdef FLOAT128
3369
3370/*----------------------------------------------------------------------------
3371| Returns the result of converting the extended double-precision floating-
3372| point value `a' to the quadruple-precision floating-point format.  The
3373| conversion is performed according to the IEC/IEEE Standard for Binary
3374| Floating-Point Arithmetic.
3375*----------------------------------------------------------------------------*/
3376
3377float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3378{
3379    flag aSign;
3380    int16 aExp;
3381    bits64 aSig, zSig0, zSig1;
3382
3383    aSig = extractFloatx80Frac( a );
3384    aExp = extractFloatx80Exp( a );
3385    aSign = extractFloatx80Sign( a );
3386    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3387        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3388    }
3389    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3390    return packFloat128( aSign, aExp, zSig0, zSig1 );
3391
3392}
3393
3394#endif
3395
3396/*----------------------------------------------------------------------------
3397| Rounds the extended double-precision floating-point value `a' to an integer,
3398| and returns the result as an extended quadruple-precision floating-point
3399| value.  The operation is performed according to the IEC/IEEE Standard for
3400| Binary Floating-Point Arithmetic.
3401*----------------------------------------------------------------------------*/
3402
3403floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3404{
3405    flag aSign;
3406    int32 aExp;
3407    bits64 lastBitMask, roundBitsMask;
3408    int8 roundingMode;
3409    floatx80 z;
3410
3411    aExp = extractFloatx80Exp( a );
3412    if ( 0x403E <= aExp ) {
3413        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3414            return propagateFloatx80NaN( a, a STATUS_VAR );
3415        }
3416        return a;
3417    }
3418    if ( aExp < 0x3FFF ) {
3419        if (    ( aExp == 0 )
3420             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3421            return a;
3422        }
3423        STATUS(float_exception_flags) |= float_flag_inexact;
3424        aSign = extractFloatx80Sign( a );
3425        switch ( STATUS(float_rounding_mode) ) {
3426         case float_round_nearest_even:
3427            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3428               ) {
3429                return
3430                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3431            }
3432            break;
3433         case float_round_down:
3434            return
3435                  aSign ?
3436                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3437                : packFloatx80( 0, 0, 0 );
3438         case float_round_up:
3439            return
3440                  aSign ? packFloatx80( 1, 0, 0 )
3441                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3442        }
3443        return packFloatx80( aSign, 0, 0 );
3444    }
3445    lastBitMask = 1;
3446    lastBitMask <<= 0x403E - aExp;
3447    roundBitsMask = lastBitMask - 1;
3448    z = a;
3449    roundingMode = STATUS(float_rounding_mode);
3450    if ( roundingMode == float_round_nearest_even ) {
3451        z.low += lastBitMask>>1;
3452        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3453    }
3454    else if ( roundingMode != float_round_to_zero ) {
3455        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3456            z.low += roundBitsMask;
3457        }
3458    }
3459    z.low &= ~ roundBitsMask;
3460    if ( z.low == 0 ) {
3461        ++z.high;
3462        z.low = LIT64( 0x8000000000000000 );
3463    }
3464    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3465    return z;
3466
3467}
3468
3469/*----------------------------------------------------------------------------
3470| Returns the result of adding the absolute values of the extended double-
3471| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3472| negated before being returned.  `zSign' is ignored if the result is a NaN.
3473| The addition is performed according to the IEC/IEEE Standard for Binary
3474| Floating-Point Arithmetic.
3475*----------------------------------------------------------------------------*/
3476
3477static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3478{
3479    int32 aExp, bExp, zExp;
3480    bits64 aSig, bSig, zSig0, zSig1;
3481    int32 expDiff;
3482
3483    aSig = extractFloatx80Frac( a );
3484    aExp = extractFloatx80Exp( a );
3485    bSig = extractFloatx80Frac( b );
3486    bExp = extractFloatx80Exp( b );
3487    expDiff = aExp - bExp;
3488    if ( 0 < expDiff ) {
3489        if ( aExp == 0x7FFF ) {
3490            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3491            return a;
3492        }
3493        if ( bExp == 0 ) --expDiff;
3494        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3495        zExp = aExp;
3496    }
3497    else if ( expDiff < 0 ) {
3498        if ( bExp == 0x7FFF ) {
3499            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3500            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3501        }
3502        if ( aExp == 0 ) ++expDiff;
3503        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3504        zExp = bExp;
3505    }
3506    else {
3507        if ( aExp == 0x7FFF ) {
3508            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3509                return propagateFloatx80NaN( a, b STATUS_VAR );
3510            }
3511            return a;
3512        }
3513        zSig1 = 0;
3514        zSig0 = aSig + bSig;
3515        if ( aExp == 0 ) {
3516            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3517            goto roundAndPack;
3518        }
3519        zExp = aExp;
3520        goto shiftRight1;
3521    }
3522    zSig0 = aSig + bSig;
3523    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3524 shiftRight1:
3525    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3526    zSig0 |= LIT64( 0x8000000000000000 );
3527    ++zExp;
3528 roundAndPack:
3529    return
3530        roundAndPackFloatx80(
3531            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3532
3533}
3534
3535/*----------------------------------------------------------------------------
3536| Returns the result of subtracting the absolute values of the extended
3537| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3538| difference is negated before being returned.  `zSign' is ignored if the
3539| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3540| Standard for Binary Floating-Point Arithmetic.
3541*----------------------------------------------------------------------------*/
3542
3543static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3544{
3545    int32 aExp, bExp, zExp;
3546    bits64 aSig, bSig, zSig0, zSig1;
3547    int32 expDiff;
3548    floatx80 z;
3549
3550    aSig = extractFloatx80Frac( a );
3551    aExp = extractFloatx80Exp( a );
3552    bSig = extractFloatx80Frac( b );
3553    bExp = extractFloatx80Exp( b );
3554    expDiff = aExp - bExp;
3555    if ( 0 < expDiff ) goto aExpBigger;
3556    if ( expDiff < 0 ) goto bExpBigger;
3557    if ( aExp == 0x7FFF ) {
3558        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3559            return propagateFloatx80NaN( a, b STATUS_VAR );
3560        }
3561        float_raise( float_flag_invalid STATUS_VAR);
3562        z.low = floatx80_default_nan_low;
3563        z.high = floatx80_default_nan_high;
3564        return z;
3565    }
3566    if ( aExp == 0 ) {
3567        aExp = 1;
3568        bExp = 1;
3569    }
3570    zSig1 = 0;
3571    if ( bSig < aSig ) goto aBigger;
3572    if ( aSig < bSig ) goto bBigger;
3573    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3574 bExpBigger:
3575    if ( bExp == 0x7FFF ) {
3576        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3577        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3578    }
3579    if ( aExp == 0 ) ++expDiff;
3580    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3581 bBigger:
3582    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3583    zExp = bExp;
3584    zSign ^= 1;
3585    goto normalizeRoundAndPack;
3586 aExpBigger:
3587    if ( aExp == 0x7FFF ) {
3588        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3589        return a;
3590    }
3591    if ( bExp == 0 ) --expDiff;
3592    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3593 aBigger:
3594    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3595    zExp = aExp;
3596 normalizeRoundAndPack:
3597    return
3598        normalizeRoundAndPackFloatx80(
3599            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3600
3601}
3602
3603/*----------------------------------------------------------------------------
3604| Returns the result of adding the extended double-precision floating-point
3605| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3606| Standard for Binary Floating-Point Arithmetic.
3607*----------------------------------------------------------------------------*/
3608
3609floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3610{
3611    flag aSign, bSign;
3612
3613    aSign = extractFloatx80Sign( a );
3614    bSign = extractFloatx80Sign( b );
3615    if ( aSign == bSign ) {
3616        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3617    }
3618    else {
3619        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3620    }
3621
3622}
3623
3624/*----------------------------------------------------------------------------
3625| Returns the result of subtracting the extended double-precision floating-
3626| point values `a' and `b'.  The operation is performed according to the
3627| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3628*----------------------------------------------------------------------------*/
3629
3630floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3631{
3632    flag aSign, bSign;
3633
3634    aSign = extractFloatx80Sign( a );
3635    bSign = extractFloatx80Sign( b );
3636    if ( aSign == bSign ) {
3637        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3638    }
3639    else {
3640        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3641    }
3642
3643}
3644
3645/*----------------------------------------------------------------------------
3646| Returns the result of multiplying the extended double-precision floating-
3647| point values `a' and `b'.  The operation is performed according to the
3648| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3649*----------------------------------------------------------------------------*/
3650
3651floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3652{
3653    flag aSign, bSign, zSign;
3654    int32 aExp, bExp, zExp;
3655    bits64 aSig, bSig, zSig0, zSig1;
3656    floatx80 z;
3657
3658    aSig = extractFloatx80Frac( a );
3659    aExp = extractFloatx80Exp( a );
3660    aSign = extractFloatx80Sign( a );
3661    bSig = extractFloatx80Frac( b );
3662    bExp = extractFloatx80Exp( b );
3663    bSign = extractFloatx80Sign( b );
3664    zSign = aSign ^ bSign;
3665    if ( aExp == 0x7FFF ) {
3666        if (    (bits64) ( aSig<<1 )
3667             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3668            return propagateFloatx80NaN( a, b STATUS_VAR );
3669        }
3670        if ( ( bExp | bSig ) == 0 ) goto invalid;
3671        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3672    }
3673    if ( bExp == 0x7FFF ) {
3674        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3675        if ( ( aExp | aSig ) == 0 ) {
3676 invalid:
3677            float_raise( float_flag_invalid STATUS_VAR);
3678            z.low = floatx80_default_nan_low;
3679            z.high = floatx80_default_nan_high;
3680            return z;
3681        }
3682        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683    }
3684    if ( aExp == 0 ) {
3685        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3686        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3687    }
3688    if ( bExp == 0 ) {
3689        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3690        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3691    }
3692    zExp = aExp + bExp - 0x3FFE;
3693    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3694    if ( 0 < (sbits64) zSig0 ) {
3695        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3696        --zExp;
3697    }
3698    return
3699        roundAndPackFloatx80(
3700            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3701
3702}
3703
3704/*----------------------------------------------------------------------------
3705| Returns the result of dividing the extended double-precision floating-point
3706| value `a' by the corresponding value `b'.  The operation is performed
3707| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3708*----------------------------------------------------------------------------*/
3709
3710floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3711{
3712    flag aSign, bSign, zSign;
3713    int32 aExp, bExp, zExp;
3714    bits64 aSig, bSig, zSig0, zSig1;
3715    bits64 rem0, rem1, rem2, term0, term1, term2;
3716    floatx80 z;
3717
3718    aSig = extractFloatx80Frac( a );
3719    aExp = extractFloatx80Exp( a );
3720    aSign = extractFloatx80Sign( a );
3721    bSig = extractFloatx80Frac( b );
3722    bExp = extractFloatx80Exp( b );
3723    bSign = extractFloatx80Sign( b );
3724    zSign = aSign ^ bSign;
3725    if ( aExp == 0x7FFF ) {
3726        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3727        if ( bExp == 0x7FFF ) {
3728            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3729            goto invalid;
3730        }
3731        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3732    }
3733    if ( bExp == 0x7FFF ) {
3734        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3735        return packFloatx80( zSign, 0, 0 );
3736    }
3737    if ( bExp == 0 ) {
3738        if ( bSig == 0 ) {
3739            if ( ( aExp | aSig ) == 0 ) {
3740 invalid:
3741                float_raise( float_flag_invalid STATUS_VAR);
3742                z.low = floatx80_default_nan_low;
3743                z.high = floatx80_default_nan_high;
3744                return z;
3745            }
3746            float_raise( float_flag_divbyzero STATUS_VAR);
3747            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3748        }
3749        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3750    }
3751    if ( aExp == 0 ) {
3752        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3753        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3754    }
3755    zExp = aExp - bExp + 0x3FFE;
3756    rem1 = 0;
3757    if ( bSig <= aSig ) {
3758        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3759        ++zExp;
3760    }
3761    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3762    mul64To128( bSig, zSig0, &term0, &term1 );
3763    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3764    while ( (sbits64) rem0 < 0 ) {
3765        --zSig0;
3766        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3767    }
3768    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3769    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3770        mul64To128( bSig, zSig1, &term1, &term2 );
3771        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3772        while ( (sbits64) rem1 < 0 ) {
3773            --zSig1;
3774            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3775        }
3776        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3777    }
3778    return
3779        roundAndPackFloatx80(
3780            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3781
3782}
3783
3784/*----------------------------------------------------------------------------
3785| Returns the remainder of the extended double-precision floating-point value
3786| `a' with respect to the corresponding value `b'.  The operation is performed
3787| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3788*----------------------------------------------------------------------------*/
3789
3790floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3791{
3792    flag aSign, bSign, zSign;
3793    int32 aExp, bExp, expDiff;
3794    bits64 aSig0, aSig1, bSig;
3795    bits64 q, term0, term1, alternateASig0, alternateASig1;
3796    floatx80 z;
3797
3798    aSig0 = extractFloatx80Frac( a );
3799    aExp = extractFloatx80Exp( a );
3800    aSign = extractFloatx80Sign( a );
3801    bSig = extractFloatx80Frac( b );
3802    bExp = extractFloatx80Exp( b );
3803    bSign = extractFloatx80Sign( b );
3804    if ( aExp == 0x7FFF ) {
3805        if (    (bits64) ( aSig0<<1 )
3806             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3807            return propagateFloatx80NaN( a, b STATUS_VAR );
3808        }
3809        goto invalid;
3810    }
3811    if ( bExp == 0x7FFF ) {
3812        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3813        return a;
3814    }
3815    if ( bExp == 0 ) {
3816        if ( bSig == 0 ) {
3817 invalid:
3818            float_raise( float_flag_invalid STATUS_VAR);
3819            z.low = floatx80_default_nan_low;
3820            z.high = floatx80_default_nan_high;
3821            return z;
3822        }
3823        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3824    }
3825    if ( aExp == 0 ) {
3826        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3827        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3828    }
3829    bSig |= LIT64( 0x8000000000000000 );
3830    zSign = aSign;
3831    expDiff = aExp - bExp;
3832    aSig1 = 0;
3833    if ( expDiff < 0 ) {
3834        if ( expDiff < -1 ) return a;
3835        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3836        expDiff = 0;
3837    }
3838    q = ( bSig <= aSig0 );
3839    if ( q ) aSig0 -= bSig;
3840    expDiff -= 64;
3841    while ( 0 < expDiff ) {
3842        q = estimateDiv128To64( aSig0, aSig1, bSig );
3843        q = ( 2 < q ) ? q - 2 : 0;
3844        mul64To128( bSig, q, &term0, &term1 );
3845        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3846        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3847        expDiff -= 62;
3848    }
3849    expDiff += 64;
3850    if ( 0 < expDiff ) {
3851        q = estimateDiv128To64( aSig0, aSig1, bSig );
3852        q = ( 2 < q ) ? q - 2 : 0;
3853        q >>= 64 - expDiff;
3854        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3855        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3856        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3857        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3858            ++q;
3859            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3860        }
3861    }
3862    else {
3863        term1 = 0;
3864        term0 = bSig;
3865    }
3866    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3867    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3868         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3869              && ( q & 1 ) )
3870       ) {
3871        aSig0 = alternateASig0;
3872        aSig1 = alternateASig1;
3873        zSign = ! zSign;
3874    }
3875    return
3876        normalizeRoundAndPackFloatx80(
3877            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3878
3879}
3880
3881/*----------------------------------------------------------------------------
3882| Returns the square root of the extended double-precision floating-point
3883| value `a'.  The operation is performed according to the IEC/IEEE Standard
3884| for Binary Floating-Point Arithmetic.
3885*----------------------------------------------------------------------------*/
3886
3887floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3888{
3889    flag aSign;
3890    int32 aExp, zExp;
3891    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3892    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3893    floatx80 z;
3894
3895    aSig0 = extractFloatx80Frac( a );
3896    aExp = extractFloatx80Exp( a );
3897    aSign = extractFloatx80Sign( a );
3898    if ( aExp == 0x7FFF ) {
3899        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3900        if ( ! aSign ) return a;
3901        goto invalid;
3902    }
3903    if ( aSign ) {
3904        if ( ( aExp | aSig0 ) == 0 ) return a;
3905 invalid:
3906        float_raise( float_flag_invalid STATUS_VAR);
3907        z.low = floatx80_default_nan_low;
3908        z.high = floatx80_default_nan_high;
3909        return z;
3910    }
3911    if ( aExp == 0 ) {
3912        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3913        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3914    }
3915    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3916    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3917    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3918    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3919    doubleZSig0 = zSig0<<1;
3920    mul64To128( zSig0, zSig0, &term0, &term1 );
3921    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3922    while ( (sbits64) rem0 < 0 ) {
3923        --zSig0;
3924        doubleZSig0 -= 2;
3925        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3926    }
3927    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3928    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3929        if ( zSig1 == 0 ) zSig1 = 1;
3930        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3931        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3932        mul64To128( zSig1, zSig1, &term2, &term3 );
3933        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3934        while ( (sbits64) rem1 < 0 ) {
3935            --zSig1;
3936            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3937            term3 |= 1;
3938            term2 |= doubleZSig0;
3939            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3940        }
3941        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3942    }
3943    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3944    zSig0 |= doubleZSig0;
3945    return
3946        roundAndPackFloatx80(
3947            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3948
3949}
3950
3951/*----------------------------------------------------------------------------
3952| Returns 1 if the extended double-precision floating-point value `a' is
3953| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3954| performed according to the IEC/IEEE Standard for Binary Floating-Point
3955| Arithmetic.
3956*----------------------------------------------------------------------------*/
3957
3958int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3959{
3960
3961    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3962              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3963         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3964              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3965       ) {
3966        if (    floatx80_is_signaling_nan( a )
3967             || floatx80_is_signaling_nan( b ) ) {
3968            float_raise( float_flag_invalid STATUS_VAR);
3969        }
3970        return 0;
3971    }
3972    return
3973           ( a.low == b.low )
3974        && (    ( a.high == b.high )
3975             || (    ( a.low == 0 )
3976                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3977           );
3978
3979}
3980
3981/*----------------------------------------------------------------------------
3982| Returns 1 if the extended double-precision floating-point value `a' is
3983| less than or equal to the corresponding value `b', and 0 otherwise.  The
3984| comparison is performed according to the IEC/IEEE Standard for Binary
3985| Floating-Point Arithmetic.
3986*----------------------------------------------------------------------------*/
3987
3988int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3989{
3990    flag aSign, bSign;
3991
3992    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3993              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3994         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3995              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3996       ) {
3997        float_raise( float_flag_invalid STATUS_VAR);
3998        return 0;
3999    }
4000    aSign = extractFloatx80Sign( a );
4001    bSign = extractFloatx80Sign( b );
4002    if ( aSign != bSign ) {
4003        return
4004               aSign
4005            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4006                 == 0 );
4007    }
4008    return
4009          aSign ? le128( b.high, b.low, a.high, a.low )
4010        : le128( a.high, a.low, b.high, b.low );
4011
4012}
4013
4014/*----------------------------------------------------------------------------
4015| Returns 1 if the extended double-precision floating-point value `a' is
4016| less than the corresponding value `b', and 0 otherwise.  The comparison
4017| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4018| Arithmetic.
4019*----------------------------------------------------------------------------*/
4020
4021int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4022{
4023    flag aSign, bSign;
4024
4025    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4026              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4027         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4028              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4029       ) {
4030        float_raise( float_flag_invalid STATUS_VAR);
4031        return 0;
4032    }
4033    aSign = extractFloatx80Sign( a );
4034    bSign = extractFloatx80Sign( b );
4035    if ( aSign != bSign ) {
4036        return
4037               aSign
4038            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4039                 != 0 );
4040    }
4041    return
4042          aSign ? lt128( b.high, b.low, a.high, a.low )
4043        : lt128( a.high, a.low, b.high, b.low );
4044
4045}
4046
4047/*----------------------------------------------------------------------------
4048| Returns 1 if the extended double-precision floating-point value `a' is equal
4049| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4050| raised if either operand is a NaN.  Otherwise, the comparison is performed
4051| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4052*----------------------------------------------------------------------------*/
4053
4054int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4055{
4056
4057    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4058              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4059         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4060              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4061       ) {
4062        float_raise( float_flag_invalid STATUS_VAR);
4063        return 0;
4064    }
4065    return
4066           ( a.low == b.low )
4067        && (    ( a.high == b.high )
4068             || (    ( a.low == 0 )
4069                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4070           );
4071
4072}
4073
4074/*----------------------------------------------------------------------------
4075| Returns 1 if the extended double-precision floating-point value `a' is less
4076| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4077| do not cause an exception.  Otherwise, the comparison is performed according
4078| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4079*----------------------------------------------------------------------------*/
4080
4081int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4082{
4083    flag aSign, bSign;
4084
4085    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4086              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4087         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4088              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4089       ) {
4090        if (    floatx80_is_signaling_nan( a )
4091             || floatx80_is_signaling_nan( b ) ) {
4092            float_raise( float_flag_invalid STATUS_VAR);
4093        }
4094        return 0;
4095    }
4096    aSign = extractFloatx80Sign( a );
4097    bSign = extractFloatx80Sign( b );
4098    if ( aSign != bSign ) {
4099        return
4100               aSign
4101            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4102                 == 0 );
4103    }
4104    return
4105          aSign ? le128( b.high, b.low, a.high, a.low )
4106        : le128( a.high, a.low, b.high, b.low );
4107
4108}
4109
4110/*----------------------------------------------------------------------------
4111| Returns 1 if the extended double-precision floating-point value `a' is less
4112| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4113| an exception.  Otherwise, the comparison is performed according to the
4114| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4115*----------------------------------------------------------------------------*/
4116
4117int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4118{
4119    flag aSign, bSign;
4120
4121    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4122              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4123         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4124              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4125       ) {
4126        if (    floatx80_is_signaling_nan( a )
4127             || floatx80_is_signaling_nan( b ) ) {
4128            float_raise( float_flag_invalid STATUS_VAR);
4129        }
4130        return 0;
4131    }
4132    aSign = extractFloatx80Sign( a );
4133    bSign = extractFloatx80Sign( b );
4134    if ( aSign != bSign ) {
4135        return
4136               aSign
4137            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4138                 != 0 );
4139    }
4140    return
4141          aSign ? lt128( b.high, b.low, a.high, a.low )
4142        : lt128( a.high, a.low, b.high, b.low );
4143
4144}
4145
4146#endif
4147
4148#ifdef FLOAT128
4149
4150/*----------------------------------------------------------------------------
4151| Returns the result of converting the quadruple-precision floating-point
4152| value `a' to the 32-bit two's complement integer format.  The conversion
4153| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4154| Arithmetic---which means in particular that the conversion is rounded
4155| according to the current rounding mode.  If `a' is a NaN, the largest
4156| positive integer is returned.  Otherwise, if the conversion overflows, the
4157| largest integer with the same sign as `a' is returned.
4158*----------------------------------------------------------------------------*/
4159
4160int32 float128_to_int32( float128 a STATUS_PARAM )
4161{
4162    flag aSign;
4163    int32 aExp, shiftCount;
4164    bits64 aSig0, aSig1;
4165
4166    aSig1 = extractFloat128Frac1( a );
4167    aSig0 = extractFloat128Frac0( a );
4168    aExp = extractFloat128Exp( a );
4169    aSign = extractFloat128Sign( a );
4170    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4171    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4172    aSig0 |= ( aSig1 != 0 );
4173    shiftCount = 0x4028 - aExp;
4174    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4175    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4176
4177}
4178
4179/*----------------------------------------------------------------------------
4180| Returns the result of converting the quadruple-precision floating-point
4181| value `a' to the 32-bit two's complement integer format.  The conversion
4182| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4183| Arithmetic, except that the conversion is always rounded toward zero.  If
4184| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4185| conversion overflows, the largest integer with the same sign as `a' is
4186| returned.
4187*----------------------------------------------------------------------------*/
4188
4189int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4190{
4191    flag aSign;
4192    int32 aExp, shiftCount;
4193    bits64 aSig0, aSig1, savedASig;
4194    int32 z;
4195
4196    aSig1 = extractFloat128Frac1( a );
4197    aSig0 = extractFloat128Frac0( a );
4198    aExp = extractFloat128Exp( a );
4199    aSign = extractFloat128Sign( a );
4200    aSig0 |= ( aSig1 != 0 );
4201    if ( 0x401E < aExp ) {
4202        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4203        goto invalid;
4204    }
4205    else if ( aExp < 0x3FFF ) {
4206        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4207        return 0;
4208    }
4209    aSig0 |= LIT64( 0x0001000000000000 );
4210    shiftCount = 0x402F - aExp;
4211    savedASig = aSig0;
4212    aSig0 >>= shiftCount;
4213    z = aSig0;
4214    if ( aSign ) z = - z;
4215    if ( ( z < 0 ) ^ aSign ) {
4216 invalid:
4217        float_raise( float_flag_invalid STATUS_VAR);
4218        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4219    }
4220    if ( ( aSig0<<shiftCount ) != savedASig ) {
4221        STATUS(float_exception_flags) |= float_flag_inexact;
4222    }
4223    return z;
4224
4225}
4226
4227/*----------------------------------------------------------------------------
4228| Returns the result of converting the quadruple-precision floating-point
4229| value `a' to the 64-bit two's complement integer format.  The conversion
4230| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4231| Arithmetic---which means in particular that the conversion is rounded
4232| according to the current rounding mode.  If `a' is a NaN, the largest
4233| positive integer is returned.  Otherwise, if the conversion overflows, the
4234| largest integer with the same sign as `a' is returned.
4235*----------------------------------------------------------------------------*/
4236
4237int64 float128_to_int64( float128 a STATUS_PARAM )
4238{
4239    flag aSign;
4240    int32 aExp, shiftCount;
4241    bits64 aSig0, aSig1;
4242
4243    aSig1 = extractFloat128Frac1( a );
4244    aSig0 = extractFloat128Frac0( a );
4245    aExp = extractFloat128Exp( a );
4246    aSign = extractFloat128Sign( a );
4247    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4248    shiftCount = 0x402F - aExp;
4249    if ( shiftCount <= 0 ) {
4250        if ( 0x403E < aExp ) {
4251            float_raise( float_flag_invalid STATUS_VAR);
4252            if (    ! aSign
4253                 || (    ( aExp == 0x7FFF )
4254                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4255                    )
4256               ) {
4257                return LIT64( 0x7FFFFFFFFFFFFFFF );
4258            }
4259            return (sbits64) LIT64( 0x8000000000000000 );
4260        }
4261        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4262    }
4263    else {
4264        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4265    }
4266    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4267
4268}
4269
4270/*----------------------------------------------------------------------------
4271| Returns the result of converting the quadruple-precision floating-point
4272| value `a' to the 64-bit two's complement integer format.  The conversion
4273| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4274| Arithmetic, except that the conversion is always rounded toward zero.
4275| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4276| the conversion overflows, the largest integer with the same sign as `a' is
4277| returned.
4278*----------------------------------------------------------------------------*/
4279
4280int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4281{
4282    flag aSign;
4283    int32 aExp, shiftCount;
4284    bits64 aSig0, aSig1;
4285    int64 z;
4286
4287    aSig1 = extractFloat128Frac1( a );
4288    aSig0 = extractFloat128Frac0( a );
4289    aExp = extractFloat128Exp( a );
4290    aSign = extractFloat128Sign( a );
4291    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4292    shiftCount = aExp - 0x402F;
4293    if ( 0 < shiftCount ) {
4294        if ( 0x403E <= aExp ) {
4295            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4296            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4297                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4298                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4299            }
4300            else {
4301                float_raise( float_flag_invalid STATUS_VAR);
4302                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4303                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4304                }
4305            }
4306            return (sbits64) LIT64( 0x8000000000000000 );
4307        }
4308        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4309        if ( (bits64) ( aSig1<<shiftCount ) ) {
4310            STATUS(float_exception_flags) |= float_flag_inexact;
4311        }
4312    }
4313    else {
4314        if ( aExp < 0x3FFF ) {
4315            if ( aExp | aSig0 | aSig1 ) {
4316                STATUS(float_exception_flags) |= float_flag_inexact;
4317            }
4318            return 0;
4319        }
4320        z = aSig0>>( - shiftCount );
4321        if (    aSig1
4322             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4323            STATUS(float_exception_flags) |= float_flag_inexact;
4324        }
4325    }
4326    if ( aSign ) z = - z;
4327    return z;
4328
4329}
4330
4331/*----------------------------------------------------------------------------
4332| Returns the result of converting the quadruple-precision floating-point
4333| value `a' to the single-precision floating-point format.  The conversion
4334| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4335| Arithmetic.
4336*----------------------------------------------------------------------------*/
4337
4338float32 float128_to_float32( float128 a STATUS_PARAM )
4339{
4340    flag aSign;
4341    int32 aExp;
4342    bits64 aSig0, aSig1;
4343    bits32 zSig;
4344
4345    aSig1 = extractFloat128Frac1( a );
4346    aSig0 = extractFloat128Frac0( a );
4347    aExp = extractFloat128Exp( a );
4348    aSign = extractFloat128Sign( a );
4349    if ( aExp == 0x7FFF ) {
4350        if ( aSig0 | aSig1 ) {
4351            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4352        }
4353        return packFloat32( aSign, 0xFF, 0 );
4354    }
4355    aSig0 |= ( aSig1 != 0 );
4356    shift64RightJamming( aSig0, 18, &aSig0 );
4357    zSig = aSig0;
4358    if ( aExp || zSig ) {
4359        zSig |= 0x40000000;
4360        aExp -= 0x3F81;
4361    }
4362    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4363
4364}
4365
4366/*----------------------------------------------------------------------------
4367| Returns the result of converting the quadruple-precision floating-point
4368| value `a' to the double-precision floating-point format.  The conversion
4369| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4370| Arithmetic.
4371*----------------------------------------------------------------------------*/
4372
4373float64 float128_to_float64( float128 a STATUS_PARAM )
4374{
4375    flag aSign;
4376    int32 aExp;
4377    bits64 aSig0, aSig1;
4378
4379    aSig1 = extractFloat128Frac1( a );
4380    aSig0 = extractFloat128Frac0( a );
4381    aExp = extractFloat128Exp( a );
4382    aSign = extractFloat128Sign( a );
4383    if ( aExp == 0x7FFF ) {
4384        if ( aSig0 | aSig1 ) {
4385            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4386        }
4387        return packFloat64( aSign, 0x7FF, 0 );
4388    }
4389    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4390    aSig0 |= ( aSig1 != 0 );
4391    if ( aExp || aSig0 ) {
4392        aSig0 |= LIT64( 0x4000000000000000 );
4393        aExp -= 0x3C01;
4394    }
4395    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4396
4397}
4398
4399#ifdef FLOATX80
4400
4401/*----------------------------------------------------------------------------
4402| Returns the result of converting the quadruple-precision floating-point
4403| value `a' to the extended double-precision floating-point format.  The
4404| conversion is performed according to the IEC/IEEE Standard for Binary
4405| Floating-Point Arithmetic.
4406*----------------------------------------------------------------------------*/
4407
4408floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4409{
4410    flag aSign;
4411    int32 aExp;
4412    bits64 aSig0, aSig1;
4413
4414    aSig1 = extractFloat128Frac1( a );
4415    aSig0 = extractFloat128Frac0( a );
4416    aExp = extractFloat128Exp( a );
4417    aSign = extractFloat128Sign( a );
4418    if ( aExp == 0x7FFF ) {
4419        if ( aSig0 | aSig1 ) {
4420            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4421        }
4422        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4423    }
4424    if ( aExp == 0 ) {
4425        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4426        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4427    }
4428    else {
4429        aSig0 |= LIT64( 0x0001000000000000 );
4430    }
4431    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4432    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4433
4434}
4435
4436#endif
4437
4438/*----------------------------------------------------------------------------
4439| Rounds the quadruple-precision floating-point value `a' to an integer, and
4440| returns the result as a quadruple-precision floating-point value.  The
4441| operation is performed according to the IEC/IEEE Standard for Binary
4442| Floating-Point Arithmetic.
4443*----------------------------------------------------------------------------*/
4444
4445float128 float128_round_to_int( float128 a STATUS_PARAM )
4446{
4447    flag aSign;
4448    int32 aExp;
4449    bits64 lastBitMask, roundBitsMask;
4450    int8 roundingMode;
4451    float128 z;
4452
4453    aExp = extractFloat128Exp( a );
4454    if ( 0x402F <= aExp ) {
4455        if ( 0x406F <= aExp ) {
4456            if (    ( aExp == 0x7FFF )
4457                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4458               ) {
4459                return propagateFloat128NaN( a, a STATUS_VAR );
4460            }
4461            return a;
4462        }
4463        lastBitMask = 1;
4464        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4465        roundBitsMask = lastBitMask - 1;
4466        z = a;
4467        roundingMode = STATUS(float_rounding_mode);
4468        if ( roundingMode == float_round_nearest_even ) {
4469            if ( lastBitMask ) {
4470                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4471                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4472            }
4473            else {
4474                if ( (sbits64) z.low < 0 ) {
4475                    ++z.high;
4476                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4477                }
4478            }
4479        }
4480        else if ( roundingMode != float_round_to_zero ) {
4481            if (   extractFloat128Sign( z )
4482                 ^ ( roundingMode == float_round_up ) ) {
4483                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4484            }
4485        }
4486        z.low &= ~ roundBitsMask;
4487    }
4488    else {
4489        if ( aExp < 0x3FFF ) {
4490            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4491            STATUS(float_exception_flags) |= float_flag_inexact;
4492            aSign = extractFloat128Sign( a );
4493            switch ( STATUS(float_rounding_mode) ) {
4494             case float_round_nearest_even:
4495                if (    ( aExp == 0x3FFE )
4496                     && (   extractFloat128Frac0( a )
4497                          | extractFloat128Frac1( a ) )
4498                   ) {
4499                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4500                }
4501                break;
4502             case float_round_down:
4503                return
4504                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4505                    : packFloat128( 0, 0, 0, 0 );
4506             case float_round_up:
4507                return
4508                      aSign ? packFloat128( 1, 0, 0, 0 )
4509                    : packFloat128( 0, 0x3FFF, 0, 0 );
4510            }
4511            return packFloat128( aSign, 0, 0, 0 );
4512        }
4513        lastBitMask = 1;
4514        lastBitMask <<= 0x402F - aExp;
4515        roundBitsMask = lastBitMask - 1;
4516        z.low = 0;
4517        z.high = a.high;
4518        roundingMode = STATUS(float_rounding_mode);
4519        if ( roundingMode == float_round_nearest_even ) {
4520            z.high += lastBitMask>>1;
4521            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4522                z.high &= ~ lastBitMask;
4523            }
4524        }
4525        else if ( roundingMode != float_round_to_zero ) {
4526            if (   extractFloat128Sign( z )
4527                 ^ ( roundingMode == float_round_up ) ) {
4528                z.high |= ( a.low != 0 );
4529                z.high += roundBitsMask;
4530            }
4531        }
4532        z.high &= ~ roundBitsMask;
4533    }
4534    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4535        STATUS(float_exception_flags) |= float_flag_inexact;
4536    }
4537    return z;
4538
4539}
4540
4541/*----------------------------------------------------------------------------
4542| Returns the result of adding the absolute values of the quadruple-precision
4543| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4544| before being returned.  `zSign' is ignored if the result is a NaN.
4545| The addition is performed according to the IEC/IEEE Standard for Binary
4546| Floating-Point Arithmetic.
4547*----------------------------------------------------------------------------*/
4548
4549static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4550{
4551    int32 aExp, bExp, zExp;
4552    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4553    int32 expDiff;
4554
4555    aSig1 = extractFloat128Frac1( a );
4556    aSig0 = extractFloat128Frac0( a );
4557    aExp = extractFloat128Exp( a );
4558    bSig1 = extractFloat128Frac1( b );
4559    bSig0 = extractFloat128Frac0( b );
4560    bExp = extractFloat128Exp( b );
4561    expDiff = aExp - bExp;
4562    if ( 0 < expDiff ) {
4563        if ( aExp == 0x7FFF ) {
4564            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4565            return a;
4566        }
4567        if ( bExp == 0 ) {
4568            --expDiff;
4569        }
4570        else {
4571            bSig0 |= LIT64( 0x0001000000000000 );
4572        }
4573        shift128ExtraRightJamming(
4574            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4575        zExp = aExp;
4576    }
4577    else if ( expDiff < 0 ) {
4578        if ( bExp == 0x7FFF ) {
4579            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4580            return packFloat128( zSign, 0x7FFF, 0, 0 );
4581        }
4582        if ( aExp == 0 ) {
4583            ++expDiff;
4584        }
4585        else {
4586            aSig0 |= LIT64( 0x0001000000000000 );
4587        }
4588        shift128ExtraRightJamming(
4589            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4590        zExp = bExp;
4591    }
4592    else {
4593        if ( aExp == 0x7FFF ) {
4594            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4595                return propagateFloat128NaN( a, b STATUS_VAR );
4596            }
4597            return a;
4598        }
4599        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4600        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4601        zSig2 = 0;
4602        zSig0 |= LIT64( 0x0002000000000000 );
4603        zExp = aExp;
4604        goto shiftRight1;
4605    }
4606    aSig0 |= LIT64( 0x0001000000000000 );
4607    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4608    --zExp;
4609    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4610    ++zExp;
4611 shiftRight1:
4612    shift128ExtraRightJamming(
4613        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4614 roundAndPack:
4615    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4616
4617}
4618
4619/*----------------------------------------------------------------------------
4620| Returns the result of subtracting the absolute values of the quadruple-
4621| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4622| difference is negated before being returned.  `zSign' is ignored if the
4623| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4624| Standard for Binary Floating-Point Arithmetic.
4625*----------------------------------------------------------------------------*/
4626
4627static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4628{
4629    int32 aExp, bExp, zExp;
4630    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4631    int32 expDiff;
4632    float128 z;
4633
4634    aSig1 = extractFloat128Frac1( a );
4635    aSig0 = extractFloat128Frac0( a );
4636    aExp = extractFloat128Exp( a );
4637    bSig1 = extractFloat128Frac1( b );
4638    bSig0 = extractFloat128Frac0( b );
4639    bExp = extractFloat128Exp( b );
4640    expDiff = aExp - bExp;
4641    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4642    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4643    if ( 0 < expDiff ) goto aExpBigger;
4644    if ( expDiff < 0 ) goto bExpBigger;
4645    if ( aExp == 0x7FFF ) {
4646        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4647            return propagateFloat128NaN( a, b STATUS_VAR );
4648        }
4649        float_raise( float_flag_invalid STATUS_VAR);
4650        z.low = float128_default_nan_low;
4651        z.high = float128_default_nan_high;
4652        return z;
4653    }
4654    if ( aExp == 0 ) {
4655        aExp = 1;
4656        bExp = 1;
4657    }
4658    if ( bSig0 < aSig0 ) goto aBigger;
4659    if ( aSig0 < bSig0 ) goto bBigger;
4660    if ( bSig1 < aSig1 ) goto aBigger;
4661    if ( aSig1 < bSig1 ) goto bBigger;
4662    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4663 bExpBigger:
4664    if ( bExp == 0x7FFF ) {
4665        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4666        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4667    }
4668    if ( aExp == 0 ) {
4669        ++expDiff;
4670    }
4671    else {
4672        aSig0 |= LIT64( 0x4000000000000000 );
4673    }
4674    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4675    bSig0 |= LIT64( 0x4000000000000000 );
4676 bBigger:
4677    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4678    zExp = bExp;
4679    zSign ^= 1;
4680    goto normalizeRoundAndPack;
4681 aExpBigger:
4682    if ( aExp == 0x7FFF ) {
4683        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4684        return a;
4685    }
4686    if ( bExp == 0 ) {
4687        --expDiff;
4688    }
4689    else {
4690        bSig0 |= LIT64( 0x4000000000000000 );
4691    }
4692    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4693    aSig0 |= LIT64( 0x4000000000000000 );
4694 aBigger:
4695    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4696    zExp = aExp;
4697 normalizeRoundAndPack:
4698    --zExp;
4699    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4700
4701}
4702
4703/*----------------------------------------------------------------------------
4704| Returns the result of adding the quadruple-precision floating-point values
4705| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4706| for Binary Floating-Point Arithmetic.
4707*----------------------------------------------------------------------------*/
4708
4709float128 float128_add( float128 a, float128 b STATUS_PARAM )
4710{
4711    flag aSign, bSign;
4712
4713    aSign = extractFloat128Sign( a );
4714    bSign = extractFloat128Sign( b );
4715    if ( aSign == bSign ) {
4716        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4717    }
4718    else {
4719        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4720    }
4721
4722}
4723
4724/*----------------------------------------------------------------------------
4725| Returns the result of subtracting the quadruple-precision floating-point
4726| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4727| Standard for Binary Floating-Point Arithmetic.
4728*----------------------------------------------------------------------------*/
4729
4730float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4731{
4732    flag aSign, bSign;
4733
4734    aSign = extractFloat128Sign( a );
4735    bSign = extractFloat128Sign( b );
4736    if ( aSign == bSign ) {
4737        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4738    }
4739    else {
4740        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4741    }
4742
4743}
4744
4745/*----------------------------------------------------------------------------
4746| Returns the result of multiplying the quadruple-precision floating-point
4747| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4748| Standard for Binary Floating-Point Arithmetic.
4749*----------------------------------------------------------------------------*/
4750
4751float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4752{
4753    flag aSign, bSign, zSign;
4754    int32 aExp, bExp, zExp;
4755    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4756    float128 z;
4757
4758    aSig1 = extractFloat128Frac1( a );
4759    aSig0 = extractFloat128Frac0( a );
4760    aExp = extractFloat128Exp( a );
4761    aSign = extractFloat128Sign( a );
4762    bSig1 = extractFloat128Frac1( b );
4763    bSig0 = extractFloat128Frac0( b );
4764    bExp = extractFloat128Exp( b );
4765    bSign = extractFloat128Sign( b );
4766    zSign = aSign ^ bSign;
4767    if ( aExp == 0x7FFF ) {
4768        if (    ( aSig0 | aSig1 )
4769             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4770            return propagateFloat128NaN( a, b STATUS_VAR );
4771        }
4772        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4773        return packFloat128( zSign, 0x7FFF, 0, 0 );
4774    }
4775    if ( bExp == 0x7FFF ) {
4776        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4777        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4778 invalid:
4779            float_raise( float_flag_invalid STATUS_VAR);
4780            z.low = float128_default_nan_low;
4781            z.high = float128_default_nan_high;
4782            return z;
4783        }
4784        return packFloat128( zSign, 0x7FFF, 0, 0 );
4785    }
4786    if ( aExp == 0 ) {
4787        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4788        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4789    }
4790    if ( bExp == 0 ) {
4791        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4792        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4793    }
4794    zExp = aExp + bExp - 0x4000;
4795    aSig0 |= LIT64( 0x0001000000000000 );
4796    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4797    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4798    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4799    zSig2 |= ( zSig3 != 0 );
4800    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4801        shift128ExtraRightJamming(
4802            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4803        ++zExp;
4804    }
4805    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4806
4807}
4808
4809/*----------------------------------------------------------------------------
4810| Returns the result of dividing the quadruple-precision floating-point value
4811| `a' by the corresponding value `b'.  The operation is performed according to
4812| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4813*----------------------------------------------------------------------------*/
4814
4815float128 float128_div( float128 a, float128 b STATUS_PARAM )
4816{
4817    flag aSign, bSign, zSign;
4818    int32 aExp, bExp, zExp;
4819    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4820    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4821    float128 z;
4822
4823    aSig1 = extractFloat128Frac1( a );
4824    aSig0 = extractFloat128Frac0( a );
4825    aExp = extractFloat128Exp( a );
4826    aSign = extractFloat128Sign( a );
4827    bSig1 = extractFloat128Frac1( b );
4828    bSig0 = extractFloat128Frac0( b );
4829    bExp = extractFloat128Exp( b );
4830    bSign = extractFloat128Sign( b );
4831    zSign = aSign ^ bSign;
4832    if ( aExp == 0x7FFF ) {
4833        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4834        if ( bExp == 0x7FFF ) {
4835            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4836            goto invalid;
4837        }
4838        return packFloat128( zSign, 0x7FFF, 0, 0 );
4839    }
4840    if ( bExp == 0x7FFF ) {
4841        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4842        return packFloat128( zSign, 0, 0, 0 );
4843    }
4844    if ( bExp == 0 ) {
4845        if ( ( bSig0 | bSig1 ) == 0 ) {
4846            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4847 invalid:
4848                float_raise( float_flag_invalid STATUS_VAR);
4849                z.low = float128_default_nan_low;
4850                z.high = float128_default_nan_high;
4851                return z;
4852            }
4853            float_raise( float_flag_divbyzero STATUS_VAR);
4854            return packFloat128( zSign, 0x7FFF, 0, 0 );
4855        }
4856        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4857    }
4858    if ( aExp == 0 ) {
4859        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4860        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4861    }
4862    zExp = aExp - bExp + 0x3FFD;
4863    shortShift128Left(
4864        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4865    shortShift128Left(
4866        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4867    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4868        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4869        ++zExp;
4870    }
4871    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4872    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4873    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4874    while ( (sbits64) rem0 < 0 ) {
4875        --zSig0;
4876        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4877    }
4878    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4879    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4880        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4881        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4882        while ( (sbits64) rem1 < 0 ) {
4883            --zSig1;
4884            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4885        }
4886        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4887    }
4888    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4889    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4890
4891}
4892
4893/*----------------------------------------------------------------------------
4894| Returns the remainder of the quadruple-precision floating-point value `a'
4895| with respect to the corresponding value `b'.  The operation is performed
4896| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4897*----------------------------------------------------------------------------*/
4898
4899float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4900{
4901    flag aSign, bSign, zSign;
4902    int32 aExp, bExp, expDiff;
4903    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4904    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4905    sbits64 sigMean0;
4906    float128 z;
4907
4908    aSig1 = extractFloat128Frac1( a );
4909    aSig0 = extractFloat128Frac0( a );
4910    aExp = extractFloat128Exp( a );
4911    aSign = extractFloat128Sign( a );
4912    bSig1 = extractFloat128Frac1( b );
4913    bSig0 = extractFloat128Frac0( b );
4914    bExp = extractFloat128Exp( b );
4915    bSign = extractFloat128Sign( b );
4916    if ( aExp == 0x7FFF ) {
4917        if (    ( aSig0 | aSig1 )
4918             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4919            return propagateFloat128NaN( a, b STATUS_VAR );
4920        }
4921        goto invalid;
4922    }
4923    if ( bExp == 0x7FFF ) {
4924        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4925        return a;
4926    }
4927    if ( bExp == 0 ) {
4928        if ( ( bSig0 | bSig1 ) == 0 ) {
4929 invalid:
4930            float_raise( float_flag_invalid STATUS_VAR);
4931            z.low = float128_default_nan_low;
4932            z.high = float128_default_nan_high;
4933            return z;
4934        }
4935        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4936    }
4937    if ( aExp == 0 ) {
4938        if ( ( aSig0 | aSig1 ) == 0 ) return a;
4939        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4940    }
4941    expDiff = aExp - bExp;
4942    if ( expDiff < -1 ) return a;
4943    shortShift128Left(
4944        aSig0 | LIT64( 0x0001000000000000 ),
4945        aSig1,
4946        15 - ( expDiff < 0 ),
4947        &aSig0,
4948        &aSig1
4949    );
4950    shortShift128Left(
4951        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4952    q = le128( bSig0, bSig1, aSig0, aSig1 );
4953    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4954    expDiff -= 64;
4955    while ( 0 < expDiff ) {
4956        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4957        q = ( 4 < q ) ? q - 4 : 0;
4958        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4959        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4960        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4961        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4962        expDiff -= 61;
4963    }
4964    if ( -64 < expDiff ) {
4965        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4966        q = ( 4 < q ) ? q - 4 : 0;
4967        q >>= - expDiff;
4968        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4969        expDiff += 52;
4970        if ( expDiff < 0 ) {
4971            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4972        }
4973        else {
4974            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4975        }
4976        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4977        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4978    }
4979    else {
4980        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4981        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4982    }
4983    do {
4984        alternateASig0 = aSig0;
4985        alternateASig1 = aSig1;
4986        ++q;
4987        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4988    } while ( 0 <= (sbits64) aSig0 );
4989    add128(
4990        aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4991    if (    ( sigMean0 < 0 )
4992         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4993        aSig0 = alternateASig0;
4994        aSig1 = alternateASig1;
4995    }
4996    zSign = ( (sbits64) aSig0 < 0 );
4997    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4998    return
4999        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5000
5001}
5002
5003/*----------------------------------------------------------------------------
5004| Returns the square root of the quadruple-precision floating-point value `a'.
5005| The operation is performed according to the IEC/IEEE Standard for Binary
5006| Floating-Point Arithmetic.
5007*----------------------------------------------------------------------------*/
5008
5009float128 float128_sqrt( float128 a STATUS_PARAM )
5010{
5011    flag aSign;
5012    int32 aExp, zExp;
5013    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5014    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5015    float128 z;
5016
5017    aSig1 = extractFloat128Frac1( a );
5018    aSig0 = extractFloat128Frac0( a );
5019    aExp = extractFloat128Exp( a );
5020    aSign = extractFloat128Sign( a );
5021    if ( aExp == 0x7FFF ) {
5022        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5023        if ( ! aSign ) return a;
5024        goto invalid;
5025    }
5026    if ( aSign ) {
5027        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5028 invalid:
5029        float_raise( float_flag_invalid STATUS_VAR);
5030        z.low = float128_default_nan_low;
5031        z.high = float128_default_nan_high;
5032        return z;
5033    }
5034    if ( aExp == 0 ) {
5035        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5036        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5037    }
5038    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5039    aSig0 |= LIT64( 0x0001000000000000 );
5040    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5041    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5042    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5043    doubleZSig0 = zSig0<<1;
5044    mul64To128( zSig0, zSig0, &term0, &term1 );
5045    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5046    while ( (sbits64) rem0 < 0 ) {
5047        --zSig0;
5048        doubleZSig0 -= 2;
5049        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5050    }
5051    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5052    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5053        if ( zSig1 == 0 ) zSig1 = 1;
5054        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5055        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5056        mul64To128( zSig1, zSig1, &term2, &term3 );
5057        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5058        while ( (sbits64) rem1 < 0 ) {
5059            --zSig1;
5060            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5061            term3 |= 1;
5062            term2 |= doubleZSig0;
5063            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5064        }
5065        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5066    }
5067    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5068    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5069
5070}
5071
5072/*----------------------------------------------------------------------------
5073| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5074| the corresponding value `b', and 0 otherwise.  The comparison is performed
5075| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5076*----------------------------------------------------------------------------*/
5077
5078int float128_eq( float128 a, float128 b STATUS_PARAM )
5079{
5080
5081    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5082              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5083         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5084              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5085       ) {
5086        if (    float128_is_signaling_nan( a )
5087             || float128_is_signaling_nan( b ) ) {
5088            float_raise( float_flag_invalid STATUS_VAR);
5089        }
5090        return 0;
5091    }
5092    return
5093           ( a.low == b.low )
5094        && (    ( a.high == b.high )
5095             || (    ( a.low == 0 )
5096                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5097           );
5098
5099}
5100
5101/*----------------------------------------------------------------------------
5102| Returns 1 if the quadruple-precision floating-point value `a' is less than
5103| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5104| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5105| Arithmetic.
5106*----------------------------------------------------------------------------*/
5107
5108int float128_le( float128 a, float128 b STATUS_PARAM )
5109{
5110    flag aSign, bSign;
5111
5112    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5113              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5114         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5115              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5116       ) {
5117        float_raise( float_flag_invalid STATUS_VAR);
5118        return 0;
5119    }
5120    aSign = extractFloat128Sign( a );
5121    bSign = extractFloat128Sign( b );
5122    if ( aSign != bSign ) {
5123        return
5124               aSign
5125            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5126                 == 0 );
5127    }
5128    return
5129          aSign ? le128( b.high, b.low, a.high, a.low )
5130        : le128( a.high, a.low, b.high, b.low );
5131
5132}
5133
5134/*----------------------------------------------------------------------------
5135| Returns 1 if the quadruple-precision floating-point value `a' is less than
5136| the corresponding value `b', and 0 otherwise.  The comparison is performed
5137| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5138*----------------------------------------------------------------------------*/
5139
5140int float128_lt( float128 a, float128 b STATUS_PARAM )
5141{
5142    flag aSign, bSign;
5143
5144    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5145              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5146         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5147              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5148       ) {
5149        float_raise( float_flag_invalid STATUS_VAR);
5150        return 0;
5151    }
5152    aSign = extractFloat128Sign( a );
5153    bSign = extractFloat128Sign( b );
5154    if ( aSign != bSign ) {
5155        return
5156               aSign
5157            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5158                 != 0 );
5159    }
5160    return
5161          aSign ? lt128( b.high, b.low, a.high, a.low )
5162        : lt128( a.high, a.low, b.high, b.low );
5163
5164}
5165
5166/*----------------------------------------------------------------------------
5167| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5168| the corresponding value `b', and 0 otherwise.  The invalid exception is
5169| raised if either operand is a NaN.  Otherwise, the comparison is performed
5170| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171*----------------------------------------------------------------------------*/
5172
5173int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5174{
5175
5176    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5177              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5178         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5179              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5180       ) {
5181        float_raise( float_flag_invalid STATUS_VAR);
5182        return 0;
5183    }
5184    return
5185           ( a.low == b.low )
5186        && (    ( a.high == b.high )
5187             || (    ( a.low == 0 )
5188                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5189           );
5190
5191}
5192
5193/*----------------------------------------------------------------------------
5194| Returns 1 if the quadruple-precision floating-point value `a' is less than
5195| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5196| cause an exception.  Otherwise, the comparison is performed according to the
5197| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5198*----------------------------------------------------------------------------*/
5199
5200int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5201{
5202    flag aSign, bSign;
5203
5204    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5205              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5206         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5207              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5208       ) {
5209        if (    float128_is_signaling_nan( a )
5210             || float128_is_signaling_nan( b ) ) {
5211            float_raise( float_flag_invalid STATUS_VAR);
5212        }
5213        return 0;
5214    }
5215    aSign = extractFloat128Sign( a );
5216    bSign = extractFloat128Sign( b );
5217    if ( aSign != bSign ) {
5218        return
5219               aSign
5220            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5221                 == 0 );
5222    }
5223    return
5224          aSign ? le128( b.high, b.low, a.high, a.low )
5225        : le128( a.high, a.low, b.high, b.low );
5226
5227}
5228
5229/*----------------------------------------------------------------------------
5230| Returns 1 if the quadruple-precision floating-point value `a' is less than
5231| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5232| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5233| Standard for Binary Floating-Point Arithmetic.
5234*----------------------------------------------------------------------------*/
5235
5236int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5237{
5238    flag aSign, bSign;
5239
5240    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5241              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5242         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5243              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5244       ) {
5245        if (    float128_is_signaling_nan( a )
5246             || float128_is_signaling_nan( b ) ) {
5247            float_raise( float_flag_invalid STATUS_VAR);
5248        }
5249        return 0;
5250    }
5251    aSign = extractFloat128Sign( a );
5252    bSign = extractFloat128Sign( b );
5253    if ( aSign != bSign ) {
5254        return
5255               aSign
5256            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5257                 != 0 );
5258    }
5259    return
5260          aSign ? lt128( b.high, b.low, a.high, a.low )
5261        : lt128( a.high, a.low, b.high, b.low );
5262
5263}
5264
5265#endif
5266
5267/* misc functions */
5268float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5269{
5270    return int64_to_float32(a STATUS_VAR);
5271}
5272
5273float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5274{
5275    return int64_to_float64(a STATUS_VAR);
5276}
5277
5278unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5279{
5280    int64_t v;
5281    unsigned int res;
5282
5283    v = float32_to_int64(a STATUS_VAR);
5284    if (v < 0) {
5285        res = 0;
5286        float_raise( float_flag_invalid STATUS_VAR);
5287    } else if (v > 0xffffffff) {
5288        res = 0xffffffff;
5289        float_raise( float_flag_invalid STATUS_VAR);
5290    } else {
5291        res = v;
5292    }
5293    return res;
5294}
5295
5296unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5297{
5298    int64_t v;
5299    unsigned int res;
5300
5301    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5302    if (v < 0) {
5303        res = 0;
5304        float_raise( float_flag_invalid STATUS_VAR);
5305    } else if (v > 0xffffffff) {
5306        res = 0xffffffff;
5307        float_raise( float_flag_invalid STATUS_VAR);
5308    } else {
5309        res = v;
5310    }
5311    return res;
5312}
5313
5314unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5315{
5316    int64_t v;
5317    unsigned int res;
5318
5319    v = float64_to_int64(a STATUS_VAR);
5320    if (v < 0) {
5321        res = 0;
5322        float_raise( float_flag_invalid STATUS_VAR);
5323    } else if (v > 0xffffffff) {
5324        res = 0xffffffff;
5325        float_raise( float_flag_invalid STATUS_VAR);
5326    } else {
5327        res = v;
5328    }
5329    return res;
5330}
5331
5332unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5333{
5334    int64_t v;
5335    unsigned int res;
5336
5337    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5338    if (v < 0) {
5339        res = 0;
5340        float_raise( float_flag_invalid STATUS_VAR);
5341    } else if (v > 0xffffffff) {
5342        res = 0xffffffff;
5343        float_raise( float_flag_invalid STATUS_VAR);
5344    } else {
5345        res = v;
5346    }
5347    return res;
5348}
5349
5350/* FIXME: This looks broken.  */
5351uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5352{
5353    int64_t v;
5354
5355    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5356    v += float64_val(a);
5357    v = float64_to_int64(make_float64(v) STATUS_VAR);
5358
5359    return v - INT64_MIN;
5360}
5361
5362uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5363{
5364    int64_t v;
5365
5366    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5367    v += float64_val(a);
5368    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5369
5370    return v - INT64_MIN;
5371}
5372
5373#define COMPARE(s, nan_exp)                                                  \
5374INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5375                                      int is_quiet STATUS_PARAM )            \
5376{                                                                            \
5377    flag aSign, bSign;                                                       \
5378    bits ## s av, bv;                                                        \
5379                                                                             \
5380    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5381         extractFloat ## s ## Frac( a ) ) ||                                 \
5382        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5383          extractFloat ## s ## Frac( b ) )) {                                \
5384        if (!is_quiet ||                                                     \
5385            float ## s ## _is_signaling_nan( a ) ||                          \
5386            float ## s ## _is_signaling_nan( b ) ) {                         \
5387            float_raise( float_flag_invalid STATUS_VAR);                     \
5388        }                                                                    \
5389        return float_relation_unordered;                                     \
5390    }                                                                        \
5391    aSign = extractFloat ## s ## Sign( a );                                  \
5392    bSign = extractFloat ## s ## Sign( b );                                  \
5393    av = float ## s ## _val(a);                                              \
5394    bv = float ## s ## _val(b);                                              \
5395    if ( aSign != bSign ) {                                                  \
5396        if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5397            /* zero case */                                                  \
5398            return float_relation_equal;                                     \
5399        } else {                                                             \
5400            return 1 - (2 * aSign);                                          \
5401        }                                                                    \
5402    } else {                                                                 \
5403        if (av == bv) {                                                      \
5404            return float_relation_equal;                                     \
5405        } else {                                                             \
5406            return 1 - 2 * (aSign ^ ( av < bv ));                            \
5407        }                                                                    \
5408    }                                                                        \
5409}                                                                            \
5410                                                                             \
5411int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5412{                                                                            \
5413    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5414}                                                                            \
5415                                                                             \
5416int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5417{                                                                            \
5418    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5419}
5420
5421COMPARE(32, 0xff)
5422COMPARE(64, 0x7ff)
5423
5424INLINE int float128_compare_internal( float128 a, float128 b,
5425                                      int is_quiet STATUS_PARAM )
5426{
5427    flag aSign, bSign;
5428
5429    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5430          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5431        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5432          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5433        if (!is_quiet ||
5434            float128_is_signaling_nan( a ) ||
5435            float128_is_signaling_nan( b ) ) {
5436            float_raise( float_flag_invalid STATUS_VAR);
5437        }
5438        return float_relation_unordered;
5439    }
5440    aSign = extractFloat128Sign( a );
5441    bSign = extractFloat128Sign( b );
5442    if ( aSign != bSign ) {
5443        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5444            /* zero case */
5445            return float_relation_equal;
5446        } else {
5447            return 1 - (2 * aSign);
5448        }
5449    } else {
5450        if (a.low == b.low && a.high == b.high) {
5451            return float_relation_equal;
5452        } else {
5453            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5454        }
5455    }
5456}
5457
5458int float128_compare( float128 a, float128 b STATUS_PARAM )
5459{
5460    return float128_compare_internal(a, b, 0 STATUS_VAR);
5461}
5462
5463int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5464{
5465    return float128_compare_internal(a, b, 1 STATUS_VAR);
5466}
5467
5468/* Multiply A by 2 raised to the power N.  */
5469float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5470{
5471    flag aSign;
5472    int16 aExp;
5473    bits32 aSig;
5474
5475    aSig = extractFloat32Frac( a );
5476    aExp = extractFloat32Exp( a );
5477    aSign = extractFloat32Sign( a );
5478
5479    if ( aExp == 0xFF ) {
5480        return a;
5481    }
5482    aExp += n;
5483    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5484}
5485
5486float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5487{
5488    flag aSign;
5489    int16 aExp;
5490    bits64 aSig;
5491
5492    aSig = extractFloat64Frac( a );
5493    aExp = extractFloat64Exp( a );
5494    aSign = extractFloat64Sign( a );
5495
5496    if ( aExp == 0x7FF ) {
5497        return a;
5498    }
5499    aExp += n;
5500    return roundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5501}
5502
5503#ifdef FLOATX80
5504floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5505{
5506    flag aSign;
5507    int16 aExp;
5508    bits64 aSig;
5509
5510    aSig = extractFloatx80Frac( a );
5511    aExp = extractFloatx80Exp( a );
5512    aSign = extractFloatx80Sign( a );
5513
5514    if ( aExp == 0x7FF ) {
5515        return a;
5516    }
5517    aExp += n;
5518    return roundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5519                                 aSign, aExp, aSig, 0 STATUS_VAR );
5520}
5521#endif
5522
5523#ifdef FLOAT128
5524float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5525{
5526    flag aSign;
5527    int32 aExp;
5528    bits64 aSig0, aSig1;
5529
5530    aSig1 = extractFloat128Frac1( a );
5531    aSig0 = extractFloat128Frac0( a );
5532    aExp = extractFloat128Exp( a );
5533    aSign = extractFloat128Sign( a );
5534    if ( aExp == 0x7FFF ) {
5535        return a;
5536    }
5537    aExp += n;
5538    return roundAndPackFloat128( aSign, aExp, aSig0, aSig1, 0 STATUS_VAR );
5539
5540}
5541#endif
5542