softfloat.c revision c27f813900a3c114562efbb8df1065e94766fc48
1 2/*============================================================================ 3 4This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic 5Package, Release 2b. 6 7Written by John R. Hauser. This work was made possible in part by the 8International Computer Science Institute, located at Suite 600, 1947 Center 9Street, Berkeley, California 94704. Funding was partially provided by the 10National Science Foundation under grant MIP-9311980. The original version 11of this code was written as part of a project to build a fixed-point vector 12processor in collaboration with the University of California at Berkeley, 13overseen by Profs. Nelson Morgan and John Wawrzynek. More information 14is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ 15arithmetic/SoftFloat.html'. 16 17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has 18been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES 19RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS 20AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, 21COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE 22EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE 23INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR 24OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. 25 26Derivative works are acceptable, even for commercial purposes, so long as 27(1) the source code for the derivative work includes prominent notice that 28the work is derivative, and (2) the source code includes prominent notice with 29these four paragraphs for those parts of this code that are retained. 30 31=============================================================================*/ 32 33#include "softfloat.h" 34 35/*---------------------------------------------------------------------------- 36| Primitive arithmetic functions, including multi-word arithmetic, and 37| division and square root approximations. (Can be specialized to target if 38| desired.) 39*----------------------------------------------------------------------------*/ 40#include "softfloat-macros.h" 41 42/*---------------------------------------------------------------------------- 43| Functions and definitions to determine: (1) whether tininess for underflow 44| is detected before or after rounding by default, (2) what (if anything) 45| happens when exceptions are raised, (3) how signaling NaNs are distinguished 46| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 47| are propagated from function inputs to output. These details are target- 48| specific. 49*----------------------------------------------------------------------------*/ 50#include "softfloat-specialize.h" 51 52void set_float_rounding_mode(int val STATUS_PARAM) 53{ 54 STATUS(float_rounding_mode) = val; 55} 56 57void set_float_exception_flags(int val STATUS_PARAM) 58{ 59 STATUS(float_exception_flags) = val; 60} 61 62#ifdef FLOATX80 63void set_floatx80_rounding_precision(int val STATUS_PARAM) 64{ 65 STATUS(floatx80_rounding_precision) = val; 66} 67#endif 68 69/*---------------------------------------------------------------------------- 70| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 71| and 7, and returns the properly rounded 32-bit integer corresponding to the 72| input. If `zSign' is 1, the input is negated before being converted to an 73| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 74| is simply rounded to an integer, with the inexact exception raised if the 75| input cannot be represented exactly as an integer. However, if the fixed- 76| point input is too large, the invalid exception is raised and the largest 77| positive or negative integer is returned. 78*----------------------------------------------------------------------------*/ 79 80static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM) 81{ 82 int8 roundingMode; 83 flag roundNearestEven; 84 int8 roundIncrement, roundBits; 85 int32 z; 86 87 roundingMode = STATUS(float_rounding_mode); 88 roundNearestEven = ( roundingMode == float_round_nearest_even ); 89 roundIncrement = 0x40; 90 if ( ! roundNearestEven ) { 91 if ( roundingMode == float_round_to_zero ) { 92 roundIncrement = 0; 93 } 94 else { 95 roundIncrement = 0x7F; 96 if ( zSign ) { 97 if ( roundingMode == float_round_up ) roundIncrement = 0; 98 } 99 else { 100 if ( roundingMode == float_round_down ) roundIncrement = 0; 101 } 102 } 103 } 104 roundBits = absZ & 0x7F; 105 absZ = ( absZ + roundIncrement )>>7; 106 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 107 z = absZ; 108 if ( zSign ) z = - z; 109 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 110 float_raise( float_flag_invalid STATUS_VAR); 111 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 112 } 113 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 114 return z; 115 116} 117 118/*---------------------------------------------------------------------------- 119| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 120| `absZ1', with binary point between bits 63 and 64 (between the input words), 121| and returns the properly rounded 64-bit integer corresponding to the input. 122| If `zSign' is 1, the input is negated before being converted to an integer. 123| Ordinarily, the fixed-point input is simply rounded to an integer, with 124| the inexact exception raised if the input cannot be represented exactly as 125| an integer. However, if the fixed-point input is too large, the invalid 126| exception is raised and the largest positive or negative integer is 127| returned. 128*----------------------------------------------------------------------------*/ 129 130static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM) 131{ 132 int8 roundingMode; 133 flag roundNearestEven, increment; 134 int64 z; 135 136 roundingMode = STATUS(float_rounding_mode); 137 roundNearestEven = ( roundingMode == float_round_nearest_even ); 138 increment = ( (sbits64) absZ1 < 0 ); 139 if ( ! roundNearestEven ) { 140 if ( roundingMode == float_round_to_zero ) { 141 increment = 0; 142 } 143 else { 144 if ( zSign ) { 145 increment = ( roundingMode == float_round_down ) && absZ1; 146 } 147 else { 148 increment = ( roundingMode == float_round_up ) && absZ1; 149 } 150 } 151 } 152 if ( increment ) { 153 ++absZ0; 154 if ( absZ0 == 0 ) goto overflow; 155 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 156 } 157 z = absZ0; 158 if ( zSign ) z = - z; 159 if ( z && ( ( z < 0 ) ^ zSign ) ) { 160 overflow: 161 float_raise( float_flag_invalid STATUS_VAR); 162 return 163 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 164 : LIT64( 0x7FFFFFFFFFFFFFFF ); 165 } 166 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact; 167 return z; 168 169} 170 171/*---------------------------------------------------------------------------- 172| Returns the fraction bits of the single-precision floating-point value `a'. 173*----------------------------------------------------------------------------*/ 174 175INLINE bits32 extractFloat32Frac( float32 a ) 176{ 177 178 return float32_val(a) & 0x007FFFFF; 179 180} 181 182/*---------------------------------------------------------------------------- 183| Returns the exponent bits of the single-precision floating-point value `a'. 184*----------------------------------------------------------------------------*/ 185 186INLINE int16 extractFloat32Exp( float32 a ) 187{ 188 189 return ( float32_val(a)>>23 ) & 0xFF; 190 191} 192 193/*---------------------------------------------------------------------------- 194| Returns the sign bit of the single-precision floating-point value `a'. 195*----------------------------------------------------------------------------*/ 196 197INLINE flag extractFloat32Sign( float32 a ) 198{ 199 200 return float32_val(a)>>31; 201 202} 203 204/*---------------------------------------------------------------------------- 205| Normalizes the subnormal single-precision floating-point value represented 206| by the denormalized significand `aSig'. The normalized exponent and 207| significand are stored at the locations pointed to by `zExpPtr' and 208| `zSigPtr', respectively. 209*----------------------------------------------------------------------------*/ 210 211static void 212 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 213{ 214 int8 shiftCount; 215 216 shiftCount = countLeadingZeros32( aSig ) - 8; 217 *zSigPtr = aSig<<shiftCount; 218 *zExpPtr = 1 - shiftCount; 219 220} 221 222/*---------------------------------------------------------------------------- 223| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 224| single-precision floating-point value, returning the result. After being 225| shifted into the proper positions, the three fields are simply added 226| together to form the result. This means that any integer portion of `zSig' 227| will be added into the exponent. Since a properly normalized significand 228| will have an integer portion equal to 1, the `zExp' input should be 1 less 229| than the desired result exponent whenever `zSig' is a complete, normalized 230| significand. 231*----------------------------------------------------------------------------*/ 232 233INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 234{ 235 236 return make_float32( 237 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig); 238 239} 240 241/*---------------------------------------------------------------------------- 242| Takes an abstract floating-point value having sign `zSign', exponent `zExp', 243| and significand `zSig', and returns the proper single-precision floating- 244| point value corresponding to the abstract input. Ordinarily, the abstract 245| value is simply rounded and packed into the single-precision format, with 246| the inexact exception raised if the abstract input cannot be represented 247| exactly. However, if the abstract value is too large, the overflow and 248| inexact exceptions are raised and an infinity or maximal finite value is 249| returned. If the abstract value is too small, the input value is rounded to 250| a subnormal number, and the underflow and inexact exceptions are raised if 251| the abstract input cannot be represented exactly as a subnormal single- 252| precision floating-point number. 253| The input significand `zSig' has its binary point between bits 30 254| and 29, which is 7 bits to the left of the usual location. This shifted 255| significand must be normalized or smaller. If `zSig' is not normalized, 256| `zExp' must be 0; in that case, the result returned is a subnormal number, 257| and it must not require rounding. In the usual case that `zSig' is 258| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 259| The handling of underflow and overflow follows the IEC/IEEE Standard for 260| Binary Floating-Point Arithmetic. 261*----------------------------------------------------------------------------*/ 262 263static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) 264{ 265 int8 roundingMode; 266 flag roundNearestEven; 267 int8 roundIncrement, roundBits; 268 flag isTiny; 269 270 roundingMode = STATUS(float_rounding_mode); 271 roundNearestEven = ( roundingMode == float_round_nearest_even ); 272 roundIncrement = 0x40; 273 if ( ! roundNearestEven ) { 274 if ( roundingMode == float_round_to_zero ) { 275 roundIncrement = 0; 276 } 277 else { 278 roundIncrement = 0x7F; 279 if ( zSign ) { 280 if ( roundingMode == float_round_up ) roundIncrement = 0; 281 } 282 else { 283 if ( roundingMode == float_round_down ) roundIncrement = 0; 284 } 285 } 286 } 287 roundBits = zSig & 0x7F; 288 if ( 0xFD <= (bits16) zExp ) { 289 if ( ( 0xFD < zExp ) 290 || ( ( zExp == 0xFD ) 291 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 292 ) { 293 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 294 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 295 } 296 if ( zExp < 0 ) { 297 isTiny = 298 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 299 || ( zExp < -1 ) 300 || ( zSig + roundIncrement < 0x80000000 ); 301 shift32RightJamming( zSig, - zExp, &zSig ); 302 zExp = 0; 303 roundBits = zSig & 0x7F; 304 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 305 } 306 } 307 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 308 zSig = ( zSig + roundIncrement )>>7; 309 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 310 if ( zSig == 0 ) zExp = 0; 311 return packFloat32( zSign, zExp, zSig ); 312 313} 314 315/*---------------------------------------------------------------------------- 316| Takes an abstract floating-point value having sign `zSign', exponent `zExp', 317| and significand `zSig', and returns the proper single-precision floating- 318| point value corresponding to the abstract input. This routine is just like 319| `roundAndPackFloat32' except that `zSig' does not have to be normalized. 320| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 321| floating-point exponent. 322*----------------------------------------------------------------------------*/ 323 324static float32 325 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) 326{ 327 int8 shiftCount; 328 329 shiftCount = countLeadingZeros32( zSig ) - 1; 330 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 331 332} 333 334/*---------------------------------------------------------------------------- 335| Returns the fraction bits of the double-precision floating-point value `a'. 336*----------------------------------------------------------------------------*/ 337 338INLINE bits64 extractFloat64Frac( float64 a ) 339{ 340 341 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 342 343} 344 345/*---------------------------------------------------------------------------- 346| Returns the exponent bits of the double-precision floating-point value `a'. 347*----------------------------------------------------------------------------*/ 348 349INLINE int16 extractFloat64Exp( float64 a ) 350{ 351 352 return ( float64_val(a)>>52 ) & 0x7FF; 353 354} 355 356/*---------------------------------------------------------------------------- 357| Returns the sign bit of the double-precision floating-point value `a'. 358*----------------------------------------------------------------------------*/ 359 360INLINE flag extractFloat64Sign( float64 a ) 361{ 362 363 return float64_val(a)>>63; 364 365} 366 367/*---------------------------------------------------------------------------- 368| Normalizes the subnormal double-precision floating-point value represented 369| by the denormalized significand `aSig'. The normalized exponent and 370| significand are stored at the locations pointed to by `zExpPtr' and 371| `zSigPtr', respectively. 372*----------------------------------------------------------------------------*/ 373 374static void 375 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 376{ 377 int8 shiftCount; 378 379 shiftCount = countLeadingZeros64( aSig ) - 11; 380 *zSigPtr = aSig<<shiftCount; 381 *zExpPtr = 1 - shiftCount; 382 383} 384 385/*---------------------------------------------------------------------------- 386| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 387| double-precision floating-point value, returning the result. After being 388| shifted into the proper positions, the three fields are simply added 389| together to form the result. This means that any integer portion of `zSig' 390| will be added into the exponent. Since a properly normalized significand 391| will have an integer portion equal to 1, the `zExp' input should be 1 less 392| than the desired result exponent whenever `zSig' is a complete, normalized 393| significand. 394*----------------------------------------------------------------------------*/ 395 396INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 397{ 398 399 return make_float64( 400 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig); 401 402} 403 404/*---------------------------------------------------------------------------- 405| Takes an abstract floating-point value having sign `zSign', exponent `zExp', 406| and significand `zSig', and returns the proper double-precision floating- 407| point value corresponding to the abstract input. Ordinarily, the abstract 408| value is simply rounded and packed into the double-precision format, with 409| the inexact exception raised if the abstract input cannot be represented 410| exactly. However, if the abstract value is too large, the overflow and 411| inexact exceptions are raised and an infinity or maximal finite value is 412| returned. If the abstract value is too small, the input value is rounded 413| to a subnormal number, and the underflow and inexact exceptions are raised 414| if the abstract input cannot be represented exactly as a subnormal double- 415| precision floating-point number. 416| The input significand `zSig' has its binary point between bits 62 417| and 61, which is 10 bits to the left of the usual location. This shifted 418| significand must be normalized or smaller. If `zSig' is not normalized, 419| `zExp' must be 0; in that case, the result returned is a subnormal number, 420| and it must not require rounding. In the usual case that `zSig' is 421| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 422| The handling of underflow and overflow follows the IEC/IEEE Standard for 423| Binary Floating-Point Arithmetic. 424*----------------------------------------------------------------------------*/ 425 426static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) 427{ 428 int8 roundingMode; 429 flag roundNearestEven; 430 int16 roundIncrement, roundBits; 431 flag isTiny; 432 433 roundingMode = STATUS(float_rounding_mode); 434 roundNearestEven = ( roundingMode == float_round_nearest_even ); 435 roundIncrement = 0x200; 436 if ( ! roundNearestEven ) { 437 if ( roundingMode == float_round_to_zero ) { 438 roundIncrement = 0; 439 } 440 else { 441 roundIncrement = 0x3FF; 442 if ( zSign ) { 443 if ( roundingMode == float_round_up ) roundIncrement = 0; 444 } 445 else { 446 if ( roundingMode == float_round_down ) roundIncrement = 0; 447 } 448 } 449 } 450 roundBits = zSig & 0x3FF; 451 if ( 0x7FD <= (bits16) zExp ) { 452 if ( ( 0x7FD < zExp ) 453 || ( ( zExp == 0x7FD ) 454 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 455 ) { 456 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 457 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); 458 } 459 if ( zExp < 0 ) { 460 isTiny = 461 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 462 || ( zExp < -1 ) 463 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 464 shift64RightJamming( zSig, - zExp, &zSig ); 465 zExp = 0; 466 roundBits = zSig & 0x3FF; 467 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 468 } 469 } 470 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 471 zSig = ( zSig + roundIncrement )>>10; 472 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 473 if ( zSig == 0 ) zExp = 0; 474 return packFloat64( zSign, zExp, zSig ); 475 476} 477 478/*---------------------------------------------------------------------------- 479| Takes an abstract floating-point value having sign `zSign', exponent `zExp', 480| and significand `zSig', and returns the proper double-precision floating- 481| point value corresponding to the abstract input. This routine is just like 482| `roundAndPackFloat64' except that `zSig' does not have to be normalized. 483| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 484| floating-point exponent. 485*----------------------------------------------------------------------------*/ 486 487static float64 488 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) 489{ 490 int8 shiftCount; 491 492 shiftCount = countLeadingZeros64( zSig ) - 1; 493 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 494 495} 496 497#ifdef FLOATX80 498 499/*---------------------------------------------------------------------------- 500| Returns the fraction bits of the extended double-precision floating-point 501| value `a'. 502*----------------------------------------------------------------------------*/ 503 504INLINE bits64 extractFloatx80Frac( floatx80 a ) 505{ 506 507 return a.low; 508 509} 510 511/*---------------------------------------------------------------------------- 512| Returns the exponent bits of the extended double-precision floating-point 513| value `a'. 514*----------------------------------------------------------------------------*/ 515 516INLINE int32 extractFloatx80Exp( floatx80 a ) 517{ 518 519 return a.high & 0x7FFF; 520 521} 522 523/*---------------------------------------------------------------------------- 524| Returns the sign bit of the extended double-precision floating-point value 525| `a'. 526*----------------------------------------------------------------------------*/ 527 528INLINE flag extractFloatx80Sign( floatx80 a ) 529{ 530 531 return a.high>>15; 532 533} 534 535/*---------------------------------------------------------------------------- 536| Normalizes the subnormal extended double-precision floating-point value 537| represented by the denormalized significand `aSig'. The normalized exponent 538| and significand are stored at the locations pointed to by `zExpPtr' and 539| `zSigPtr', respectively. 540*----------------------------------------------------------------------------*/ 541 542static void 543 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 544{ 545 int8 shiftCount; 546 547 shiftCount = countLeadingZeros64( aSig ); 548 *zSigPtr = aSig<<shiftCount; 549 *zExpPtr = 1 - shiftCount; 550 551} 552 553/*---------------------------------------------------------------------------- 554| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 555| extended double-precision floating-point value, returning the result. 556*----------------------------------------------------------------------------*/ 557 558INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 559{ 560 floatx80 z; 561 562 z.low = zSig; 563 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 564 return z; 565 566} 567 568/*---------------------------------------------------------------------------- 569| Takes an abstract floating-point value having sign `zSign', exponent `zExp', 570| and extended significand formed by the concatenation of `zSig0' and `zSig1', 571| and returns the proper extended double-precision floating-point value 572| corresponding to the abstract input. Ordinarily, the abstract value is 573| rounded and packed into the extended double-precision format, with the 574| inexact exception raised if the abstract input cannot be represented 575| exactly. However, if the abstract value is too large, the overflow and 576| inexact exceptions are raised and an infinity or maximal finite value is 577| returned. If the abstract value is too small, the input value is rounded to 578| a subnormal number, and the underflow and inexact exceptions are raised if 579| the abstract input cannot be represented exactly as a subnormal extended 580| double-precision floating-point number. 581| If `roundingPrecision' is 32 or 64, the result is rounded to the same 582| number of bits as single or double precision, respectively. Otherwise, the 583| result is rounded to the full precision of the extended double-precision 584| format. 585| The input significand must be normalized or smaller. If the input 586| significand is not normalized, `zExp' must be 0; in that case, the result 587| returned is a subnormal number, and it must not require rounding. The 588| handling of underflow and overflow follows the IEC/IEEE Standard for Binary 589| Floating-Point Arithmetic. 590*----------------------------------------------------------------------------*/ 591 592static floatx80 593 roundAndPackFloatx80( 594 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 595 STATUS_PARAM) 596{ 597 int8 roundingMode; 598 flag roundNearestEven, increment, isTiny; 599 int64 roundIncrement, roundMask, roundBits; 600 601 roundingMode = STATUS(float_rounding_mode); 602 roundNearestEven = ( roundingMode == float_round_nearest_even ); 603 if ( roundingPrecision == 80 ) goto precision80; 604 if ( roundingPrecision == 64 ) { 605 roundIncrement = LIT64( 0x0000000000000400 ); 606 roundMask = LIT64( 0x00000000000007FF ); 607 } 608 else if ( roundingPrecision == 32 ) { 609 roundIncrement = LIT64( 0x0000008000000000 ); 610 roundMask = LIT64( 0x000000FFFFFFFFFF ); 611 } 612 else { 613 goto precision80; 614 } 615 zSig0 |= ( zSig1 != 0 ); 616 if ( ! roundNearestEven ) { 617 if ( roundingMode == float_round_to_zero ) { 618 roundIncrement = 0; 619 } 620 else { 621 roundIncrement = roundMask; 622 if ( zSign ) { 623 if ( roundingMode == float_round_up ) roundIncrement = 0; 624 } 625 else { 626 if ( roundingMode == float_round_down ) roundIncrement = 0; 627 } 628 } 629 } 630 roundBits = zSig0 & roundMask; 631 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 632 if ( ( 0x7FFE < zExp ) 633 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 634 ) { 635 goto overflow; 636 } 637 if ( zExp <= 0 ) { 638 isTiny = 639 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 640 || ( zExp < 0 ) 641 || ( zSig0 <= zSig0 + roundIncrement ); 642 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 643 zExp = 0; 644 roundBits = zSig0 & roundMask; 645 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 646 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 647 zSig0 += roundIncrement; 648 if ( (sbits64) zSig0 < 0 ) zExp = 1; 649 roundIncrement = roundMask + 1; 650 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 651 roundMask |= roundIncrement; 652 } 653 zSig0 &= ~ roundMask; 654 return packFloatx80( zSign, zExp, zSig0 ); 655 } 656 } 657 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 658 zSig0 += roundIncrement; 659 if ( zSig0 < roundIncrement ) { 660 ++zExp; 661 zSig0 = LIT64( 0x8000000000000000 ); 662 } 663 roundIncrement = roundMask + 1; 664 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 665 roundMask |= roundIncrement; 666 } 667 zSig0 &= ~ roundMask; 668 if ( zSig0 == 0 ) zExp = 0; 669 return packFloatx80( zSign, zExp, zSig0 ); 670 precision80: 671 increment = ( (sbits64) zSig1 < 0 ); 672 if ( ! roundNearestEven ) { 673 if ( roundingMode == float_round_to_zero ) { 674 increment = 0; 675 } 676 else { 677 if ( zSign ) { 678 increment = ( roundingMode == float_round_down ) && zSig1; 679 } 680 else { 681 increment = ( roundingMode == float_round_up ) && zSig1; 682 } 683 } 684 } 685 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 686 if ( ( 0x7FFE < zExp ) 687 || ( ( zExp == 0x7FFE ) 688 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 689 && increment 690 ) 691 ) { 692 roundMask = 0; 693 overflow: 694 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 695 if ( ( roundingMode == float_round_to_zero ) 696 || ( zSign && ( roundingMode == float_round_up ) ) 697 || ( ! zSign && ( roundingMode == float_round_down ) ) 698 ) { 699 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 700 } 701 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 702 } 703 if ( zExp <= 0 ) { 704 isTiny = 705 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 706 || ( zExp < 0 ) 707 || ! increment 708 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 709 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 710 zExp = 0; 711 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR); 712 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 713 if ( roundNearestEven ) { 714 increment = ( (sbits64) zSig1 < 0 ); 715 } 716 else { 717 if ( zSign ) { 718 increment = ( roundingMode == float_round_down ) && zSig1; 719 } 720 else { 721 increment = ( roundingMode == float_round_up ) && zSig1; 722 } 723 } 724 if ( increment ) { 725 ++zSig0; 726 zSig0 &= 727 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 728 if ( (sbits64) zSig0 < 0 ) zExp = 1; 729 } 730 return packFloatx80( zSign, zExp, zSig0 ); 731 } 732 } 733 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 734 if ( increment ) { 735 ++zSig0; 736 if ( zSig0 == 0 ) { 737 ++zExp; 738 zSig0 = LIT64( 0x8000000000000000 ); 739 } 740 else { 741 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 742 } 743 } 744 else { 745 if ( zSig0 == 0 ) zExp = 0; 746 } 747 return packFloatx80( zSign, zExp, zSig0 ); 748 749} 750 751/*---------------------------------------------------------------------------- 752| Takes an abstract floating-point value having sign `zSign', exponent 753| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 754| and returns the proper extended double-precision floating-point value 755| corresponding to the abstract input. This routine is just like 756| `roundAndPackFloatx80' except that the input significand does not have to be 757| normalized. 758*----------------------------------------------------------------------------*/ 759 760static floatx80 761 normalizeRoundAndPackFloatx80( 762 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 763 STATUS_PARAM) 764{ 765 int8 shiftCount; 766 767 if ( zSig0 == 0 ) { 768 zSig0 = zSig1; 769 zSig1 = 0; 770 zExp -= 64; 771 } 772 shiftCount = countLeadingZeros64( zSig0 ); 773 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 774 zExp -= shiftCount; 775 return 776 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR); 777 778} 779 780#endif 781 782#ifdef FLOAT128 783 784/*---------------------------------------------------------------------------- 785| Returns the least-significant 64 fraction bits of the quadruple-precision 786| floating-point value `a'. 787*----------------------------------------------------------------------------*/ 788 789INLINE bits64 extractFloat128Frac1( float128 a ) 790{ 791 792 return a.low; 793 794} 795 796/*---------------------------------------------------------------------------- 797| Returns the most-significant 48 fraction bits of the quadruple-precision 798| floating-point value `a'. 799*----------------------------------------------------------------------------*/ 800 801INLINE bits64 extractFloat128Frac0( float128 a ) 802{ 803 804 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 805 806} 807 808/*---------------------------------------------------------------------------- 809| Returns the exponent bits of the quadruple-precision floating-point value 810| `a'. 811*----------------------------------------------------------------------------*/ 812 813INLINE int32 extractFloat128Exp( float128 a ) 814{ 815 816 return ( a.high>>48 ) & 0x7FFF; 817 818} 819 820/*---------------------------------------------------------------------------- 821| Returns the sign bit of the quadruple-precision floating-point value `a'. 822*----------------------------------------------------------------------------*/ 823 824INLINE flag extractFloat128Sign( float128 a ) 825{ 826 827 return a.high>>63; 828 829} 830 831/*---------------------------------------------------------------------------- 832| Normalizes the subnormal quadruple-precision floating-point value 833| represented by the denormalized significand formed by the concatenation of 834| `aSig0' and `aSig1'. The normalized exponent is stored at the location 835| pointed to by `zExpPtr'. The most significant 49 bits of the normalized 836| significand are stored at the location pointed to by `zSig0Ptr', and the 837| least significant 64 bits of the normalized significand are stored at the 838| location pointed to by `zSig1Ptr'. 839*----------------------------------------------------------------------------*/ 840 841static void 842 normalizeFloat128Subnormal( 843 bits64 aSig0, 844 bits64 aSig1, 845 int32 *zExpPtr, 846 bits64 *zSig0Ptr, 847 bits64 *zSig1Ptr 848 ) 849{ 850 int8 shiftCount; 851 852 if ( aSig0 == 0 ) { 853 shiftCount = countLeadingZeros64( aSig1 ) - 15; 854 if ( shiftCount < 0 ) { 855 *zSig0Ptr = aSig1>>( - shiftCount ); 856 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 857 } 858 else { 859 *zSig0Ptr = aSig1<<shiftCount; 860 *zSig1Ptr = 0; 861 } 862 *zExpPtr = - shiftCount - 63; 863 } 864 else { 865 shiftCount = countLeadingZeros64( aSig0 ) - 15; 866 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 867 *zExpPtr = 1 - shiftCount; 868 } 869 870} 871 872/*---------------------------------------------------------------------------- 873| Packs the sign `zSign', the exponent `zExp', and the significand formed 874| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 875| floating-point value, returning the result. After being shifted into the 876| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 877| added together to form the most significant 32 bits of the result. This 878| means that any integer portion of `zSig0' will be added into the exponent. 879| Since a properly normalized significand will have an integer portion equal 880| to 1, the `zExp' input should be 1 less than the desired result exponent 881| whenever `zSig0' and `zSig1' concatenated form a complete, normalized 882| significand. 883*----------------------------------------------------------------------------*/ 884 885INLINE float128 886 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 887{ 888 float128 z; 889 890 z.low = zSig1; 891 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 892 return z; 893 894} 895 896/*---------------------------------------------------------------------------- 897| Takes an abstract floating-point value having sign `zSign', exponent `zExp', 898| and extended significand formed by the concatenation of `zSig0', `zSig1', 899| and `zSig2', and returns the proper quadruple-precision floating-point value 900| corresponding to the abstract input. Ordinarily, the abstract value is 901| simply rounded and packed into the quadruple-precision format, with the 902| inexact exception raised if the abstract input cannot be represented 903| exactly. However, if the abstract value is too large, the overflow and 904| inexact exceptions are raised and an infinity or maximal finite value is 905| returned. If the abstract value is too small, the input value is rounded to 906| a subnormal number, and the underflow and inexact exceptions are raised if 907| the abstract input cannot be represented exactly as a subnormal quadruple- 908| precision floating-point number. 909| The input significand must be normalized or smaller. If the input 910| significand is not normalized, `zExp' must be 0; in that case, the result 911| returned is a subnormal number, and it must not require rounding. In the 912| usual case that the input significand is normalized, `zExp' must be 1 less 913| than the ``true'' floating-point exponent. The handling of underflow and 914| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 915*----------------------------------------------------------------------------*/ 916 917static float128 918 roundAndPackFloat128( 919 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM) 920{ 921 int8 roundingMode; 922 flag roundNearestEven, increment, isTiny; 923 924 roundingMode = STATUS(float_rounding_mode); 925 roundNearestEven = ( roundingMode == float_round_nearest_even ); 926 increment = ( (sbits64) zSig2 < 0 ); 927 if ( ! roundNearestEven ) { 928 if ( roundingMode == float_round_to_zero ) { 929 increment = 0; 930 } 931 else { 932 if ( zSign ) { 933 increment = ( roundingMode == float_round_down ) && zSig2; 934 } 935 else { 936 increment = ( roundingMode == float_round_up ) && zSig2; 937 } 938 } 939 } 940 if ( 0x7FFD <= (bits32) zExp ) { 941 if ( ( 0x7FFD < zExp ) 942 || ( ( zExp == 0x7FFD ) 943 && eq128( 944 LIT64( 0x0001FFFFFFFFFFFF ), 945 LIT64( 0xFFFFFFFFFFFFFFFF ), 946 zSig0, 947 zSig1 948 ) 949 && increment 950 ) 951 ) { 952 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 953 if ( ( roundingMode == float_round_to_zero ) 954 || ( zSign && ( roundingMode == float_round_up ) ) 955 || ( ! zSign && ( roundingMode == float_round_down ) ) 956 ) { 957 return 958 packFloat128( 959 zSign, 960 0x7FFE, 961 LIT64( 0x0000FFFFFFFFFFFF ), 962 LIT64( 0xFFFFFFFFFFFFFFFF ) 963 ); 964 } 965 return packFloat128( zSign, 0x7FFF, 0, 0 ); 966 } 967 if ( zExp < 0 ) { 968 isTiny = 969 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 970 || ( zExp < -1 ) 971 || ! increment 972 || lt128( 973 zSig0, 974 zSig1, 975 LIT64( 0x0001FFFFFFFFFFFF ), 976 LIT64( 0xFFFFFFFFFFFFFFFF ) 977 ); 978 shift128ExtraRightJamming( 979 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 980 zExp = 0; 981 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR); 982 if ( roundNearestEven ) { 983 increment = ( (sbits64) zSig2 < 0 ); 984 } 985 else { 986 if ( zSign ) { 987 increment = ( roundingMode == float_round_down ) && zSig2; 988 } 989 else { 990 increment = ( roundingMode == float_round_up ) && zSig2; 991 } 992 } 993 } 994 } 995 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact; 996 if ( increment ) { 997 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 998 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 999 } 1000 else { 1001 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1002 } 1003 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1004 1005} 1006 1007/*---------------------------------------------------------------------------- 1008| Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1009| and significand formed by the concatenation of `zSig0' and `zSig1', and 1010| returns the proper quadruple-precision floating-point value corresponding 1011| to the abstract input. This routine is just like `roundAndPackFloat128' 1012| except that the input significand has fewer bits and does not have to be 1013| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1014| point exponent. 1015*----------------------------------------------------------------------------*/ 1016 1017static float128 1018 normalizeRoundAndPackFloat128( 1019 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM) 1020{ 1021 int8 shiftCount; 1022 bits64 zSig2; 1023 1024 if ( zSig0 == 0 ) { 1025 zSig0 = zSig1; 1026 zSig1 = 0; 1027 zExp -= 64; 1028 } 1029 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1030 if ( 0 <= shiftCount ) { 1031 zSig2 = 0; 1032 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1033 } 1034 else { 1035 shift128ExtraRightJamming( 1036 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1037 } 1038 zExp -= shiftCount; 1039 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR); 1040 1041} 1042 1043#endif 1044 1045/*---------------------------------------------------------------------------- 1046| Returns the result of converting the 32-bit two's complement integer `a' 1047| to the single-precision floating-point format. The conversion is performed 1048| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1049*----------------------------------------------------------------------------*/ 1050 1051float32 int32_to_float32( int32 a STATUS_PARAM ) 1052{ 1053 flag zSign; 1054 1055 if ( a == 0 ) return float32_zero; 1056 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1057 zSign = ( a < 0 ); 1058 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR ); 1059 1060} 1061 1062/*---------------------------------------------------------------------------- 1063| Returns the result of converting the 32-bit two's complement integer `a' 1064| to the double-precision floating-point format. The conversion is performed 1065| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1066*----------------------------------------------------------------------------*/ 1067 1068float64 int32_to_float64( int32 a STATUS_PARAM ) 1069{ 1070 flag zSign; 1071 uint32 absA; 1072 int8 shiftCount; 1073 bits64 zSig; 1074 1075 if ( a == 0 ) return float64_zero; 1076 zSign = ( a < 0 ); 1077 absA = zSign ? - a : a; 1078 shiftCount = countLeadingZeros32( absA ) + 21; 1079 zSig = absA; 1080 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1081 1082} 1083 1084#ifdef FLOATX80 1085 1086/*---------------------------------------------------------------------------- 1087| Returns the result of converting the 32-bit two's complement integer `a' 1088| to the extended double-precision floating-point format. The conversion 1089| is performed according to the IEC/IEEE Standard for Binary Floating-Point 1090| Arithmetic. 1091*----------------------------------------------------------------------------*/ 1092 1093floatx80 int32_to_floatx80( int32 a STATUS_PARAM ) 1094{ 1095 flag zSign; 1096 uint32 absA; 1097 int8 shiftCount; 1098 bits64 zSig; 1099 1100 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1101 zSign = ( a < 0 ); 1102 absA = zSign ? - a : a; 1103 shiftCount = countLeadingZeros32( absA ) + 32; 1104 zSig = absA; 1105 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1106 1107} 1108 1109#endif 1110 1111#ifdef FLOAT128 1112 1113/*---------------------------------------------------------------------------- 1114| Returns the result of converting the 32-bit two's complement integer `a' to 1115| the quadruple-precision floating-point format. The conversion is performed 1116| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1117*----------------------------------------------------------------------------*/ 1118 1119float128 int32_to_float128( int32 a STATUS_PARAM ) 1120{ 1121 flag zSign; 1122 uint32 absA; 1123 int8 shiftCount; 1124 bits64 zSig0; 1125 1126 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1127 zSign = ( a < 0 ); 1128 absA = zSign ? - a : a; 1129 shiftCount = countLeadingZeros32( absA ) + 17; 1130 zSig0 = absA; 1131 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1132 1133} 1134 1135#endif 1136 1137/*---------------------------------------------------------------------------- 1138| Returns the result of converting the 64-bit two's complement integer `a' 1139| to the single-precision floating-point format. The conversion is performed 1140| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1141*----------------------------------------------------------------------------*/ 1142 1143float32 int64_to_float32( int64 a STATUS_PARAM ) 1144{ 1145 flag zSign; 1146 uint64 absA; 1147 int8 shiftCount; 1148 1149 if ( a == 0 ) return float32_zero; 1150 zSign = ( a < 0 ); 1151 absA = zSign ? - a : a; 1152 shiftCount = countLeadingZeros64( absA ) - 40; 1153 if ( 0 <= shiftCount ) { 1154 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1155 } 1156 else { 1157 shiftCount += 7; 1158 if ( shiftCount < 0 ) { 1159 shift64RightJamming( absA, - shiftCount, &absA ); 1160 } 1161 else { 1162 absA <<= shiftCount; 1163 } 1164 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR ); 1165 } 1166 1167} 1168 1169float32 uint64_to_float32( uint64 a STATUS_PARAM ) 1170{ 1171 int8 shiftCount; 1172 1173 if ( a == 0 ) return float32_zero; 1174 shiftCount = countLeadingZeros64( a ) - 40; 1175 if ( 0 <= shiftCount ) { 1176 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount ); 1177 } 1178 else { 1179 shiftCount += 7; 1180 if ( shiftCount < 0 ) { 1181 shift64RightJamming( a, - shiftCount, &a ); 1182 } 1183 else { 1184 a <<= shiftCount; 1185 } 1186 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR ); 1187 } 1188} 1189 1190/*---------------------------------------------------------------------------- 1191| Returns the result of converting the 64-bit two's complement integer `a' 1192| to the double-precision floating-point format. The conversion is performed 1193| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1194*----------------------------------------------------------------------------*/ 1195 1196float64 int64_to_float64( int64 a STATUS_PARAM ) 1197{ 1198 flag zSign; 1199 1200 if ( a == 0 ) return float64_zero; 1201 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1202 return packFloat64( 1, 0x43E, 0 ); 1203 } 1204 zSign = ( a < 0 ); 1205 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR ); 1206 1207} 1208 1209float64 uint64_to_float64( uint64 a STATUS_PARAM ) 1210{ 1211 if ( a == 0 ) return float64_zero; 1212 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR ); 1213 1214} 1215 1216#ifdef FLOATX80 1217 1218/*---------------------------------------------------------------------------- 1219| Returns the result of converting the 64-bit two's complement integer `a' 1220| to the extended double-precision floating-point format. The conversion 1221| is performed according to the IEC/IEEE Standard for Binary Floating-Point 1222| Arithmetic. 1223*----------------------------------------------------------------------------*/ 1224 1225floatx80 int64_to_floatx80( int64 a STATUS_PARAM ) 1226{ 1227 flag zSign; 1228 uint64 absA; 1229 int8 shiftCount; 1230 1231 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1232 zSign = ( a < 0 ); 1233 absA = zSign ? - a : a; 1234 shiftCount = countLeadingZeros64( absA ); 1235 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1236 1237} 1238 1239#endif 1240 1241#ifdef FLOAT128 1242 1243/*---------------------------------------------------------------------------- 1244| Returns the result of converting the 64-bit two's complement integer `a' to 1245| the quadruple-precision floating-point format. The conversion is performed 1246| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1247*----------------------------------------------------------------------------*/ 1248 1249float128 int64_to_float128( int64 a STATUS_PARAM ) 1250{ 1251 flag zSign; 1252 uint64 absA; 1253 int8 shiftCount; 1254 int32 zExp; 1255 bits64 zSig0, zSig1; 1256 1257 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1258 zSign = ( a < 0 ); 1259 absA = zSign ? - a : a; 1260 shiftCount = countLeadingZeros64( absA ) + 49; 1261 zExp = 0x406E - shiftCount; 1262 if ( 64 <= shiftCount ) { 1263 zSig1 = 0; 1264 zSig0 = absA; 1265 shiftCount -= 64; 1266 } 1267 else { 1268 zSig1 = absA; 1269 zSig0 = 0; 1270 } 1271 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1272 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1273 1274} 1275 1276#endif 1277 1278/*---------------------------------------------------------------------------- 1279| Returns the result of converting the single-precision floating-point value 1280| `a' to the 32-bit two's complement integer format. The conversion is 1281| performed according to the IEC/IEEE Standard for Binary Floating-Point 1282| Arithmetic---which means in particular that the conversion is rounded 1283| according to the current rounding mode. If `a' is a NaN, the largest 1284| positive integer is returned. Otherwise, if the conversion overflows, the 1285| largest integer with the same sign as `a' is returned. 1286*----------------------------------------------------------------------------*/ 1287 1288int32 float32_to_int32( float32 a STATUS_PARAM ) 1289{ 1290 flag aSign; 1291 int16 aExp, shiftCount; 1292 bits32 aSig; 1293 bits64 aSig64; 1294 1295 aSig = extractFloat32Frac( a ); 1296 aExp = extractFloat32Exp( a ); 1297 aSign = extractFloat32Sign( a ); 1298 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1299 if ( aExp ) aSig |= 0x00800000; 1300 shiftCount = 0xAF - aExp; 1301 aSig64 = aSig; 1302 aSig64 <<= 32; 1303 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1304 return roundAndPackInt32( aSign, aSig64 STATUS_VAR ); 1305 1306} 1307 1308/*---------------------------------------------------------------------------- 1309| Returns the result of converting the single-precision floating-point value 1310| `a' to the 32-bit two's complement integer format. The conversion is 1311| performed according to the IEC/IEEE Standard for Binary Floating-Point 1312| Arithmetic, except that the conversion is always rounded toward zero. 1313| If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1314| the conversion overflows, the largest integer with the same sign as `a' is 1315| returned. 1316*----------------------------------------------------------------------------*/ 1317 1318int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM ) 1319{ 1320 flag aSign; 1321 int16 aExp, shiftCount; 1322 bits32 aSig; 1323 int32 z; 1324 1325 aSig = extractFloat32Frac( a ); 1326 aExp = extractFloat32Exp( a ); 1327 aSign = extractFloat32Sign( a ); 1328 shiftCount = aExp - 0x9E; 1329 if ( 0 <= shiftCount ) { 1330 if ( float32_val(a) != 0xCF000000 ) { 1331 float_raise( float_flag_invalid STATUS_VAR); 1332 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1333 } 1334 return (sbits32) 0x80000000; 1335 } 1336 else if ( aExp <= 0x7E ) { 1337 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1338 return 0; 1339 } 1340 aSig = ( aSig | 0x00800000 )<<8; 1341 z = aSig>>( - shiftCount ); 1342 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1343 STATUS(float_exception_flags) |= float_flag_inexact; 1344 } 1345 if ( aSign ) z = - z; 1346 return z; 1347 1348} 1349 1350/*---------------------------------------------------------------------------- 1351| Returns the result of converting the single-precision floating-point value 1352| `a' to the 64-bit two's complement integer format. The conversion is 1353| performed according to the IEC/IEEE Standard for Binary Floating-Point 1354| Arithmetic---which means in particular that the conversion is rounded 1355| according to the current rounding mode. If `a' is a NaN, the largest 1356| positive integer is returned. Otherwise, if the conversion overflows, the 1357| largest integer with the same sign as `a' is returned. 1358*----------------------------------------------------------------------------*/ 1359 1360int64 float32_to_int64( float32 a STATUS_PARAM ) 1361{ 1362 flag aSign; 1363 int16 aExp, shiftCount; 1364 bits32 aSig; 1365 bits64 aSig64, aSigExtra; 1366 1367 aSig = extractFloat32Frac( a ); 1368 aExp = extractFloat32Exp( a ); 1369 aSign = extractFloat32Sign( a ); 1370 shiftCount = 0xBE - aExp; 1371 if ( shiftCount < 0 ) { 1372 float_raise( float_flag_invalid STATUS_VAR); 1373 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1374 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1375 } 1376 return (sbits64) LIT64( 0x8000000000000000 ); 1377 } 1378 if ( aExp ) aSig |= 0x00800000; 1379 aSig64 = aSig; 1380 aSig64 <<= 40; 1381 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1382 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR ); 1383 1384} 1385 1386/*---------------------------------------------------------------------------- 1387| Returns the result of converting the single-precision floating-point value 1388| `a' to the 64-bit two's complement integer format. The conversion is 1389| performed according to the IEC/IEEE Standard for Binary Floating-Point 1390| Arithmetic, except that the conversion is always rounded toward zero. If 1391| `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1392| conversion overflows, the largest integer with the same sign as `a' is 1393| returned. 1394*----------------------------------------------------------------------------*/ 1395 1396int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM ) 1397{ 1398 flag aSign; 1399 int16 aExp, shiftCount; 1400 bits32 aSig; 1401 bits64 aSig64; 1402 int64 z; 1403 1404 aSig = extractFloat32Frac( a ); 1405 aExp = extractFloat32Exp( a ); 1406 aSign = extractFloat32Sign( a ); 1407 shiftCount = aExp - 0xBE; 1408 if ( 0 <= shiftCount ) { 1409 if ( float32_val(a) != 0xDF000000 ) { 1410 float_raise( float_flag_invalid STATUS_VAR); 1411 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1412 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1413 } 1414 } 1415 return (sbits64) LIT64( 0x8000000000000000 ); 1416 } 1417 else if ( aExp <= 0x7E ) { 1418 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1419 return 0; 1420 } 1421 aSig64 = aSig | 0x00800000; 1422 aSig64 <<= 40; 1423 z = aSig64>>( - shiftCount ); 1424 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1425 STATUS(float_exception_flags) |= float_flag_inexact; 1426 } 1427 if ( aSign ) z = - z; 1428 return z; 1429 1430} 1431 1432/*---------------------------------------------------------------------------- 1433| Returns the result of converting the single-precision floating-point value 1434| `a' to the double-precision floating-point format. The conversion is 1435| performed according to the IEC/IEEE Standard for Binary Floating-Point 1436| Arithmetic. 1437*----------------------------------------------------------------------------*/ 1438 1439float64 float32_to_float64( float32 a STATUS_PARAM ) 1440{ 1441 flag aSign; 1442 int16 aExp; 1443 bits32 aSig; 1444 1445 aSig = extractFloat32Frac( a ); 1446 aExp = extractFloat32Exp( a ); 1447 aSign = extractFloat32Sign( a ); 1448 if ( aExp == 0xFF ) { 1449 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR )); 1450 return packFloat64( aSign, 0x7FF, 0 ); 1451 } 1452 if ( aExp == 0 ) { 1453 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1454 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1455 --aExp; 1456 } 1457 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1458 1459} 1460 1461#ifdef FLOATX80 1462 1463/*---------------------------------------------------------------------------- 1464| Returns the result of converting the single-precision floating-point value 1465| `a' to the extended double-precision floating-point format. The conversion 1466| is performed according to the IEC/IEEE Standard for Binary Floating-Point 1467| Arithmetic. 1468*----------------------------------------------------------------------------*/ 1469 1470floatx80 float32_to_floatx80( float32 a STATUS_PARAM ) 1471{ 1472 flag aSign; 1473 int16 aExp; 1474 bits32 aSig; 1475 1476 aSig = extractFloat32Frac( a ); 1477 aExp = extractFloat32Exp( a ); 1478 aSign = extractFloat32Sign( a ); 1479 if ( aExp == 0xFF ) { 1480 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) ); 1481 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1482 } 1483 if ( aExp == 0 ) { 1484 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1485 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1486 } 1487 aSig |= 0x00800000; 1488 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1489 1490} 1491 1492#endif 1493 1494#ifdef FLOAT128 1495 1496/*---------------------------------------------------------------------------- 1497| Returns the result of converting the single-precision floating-point value 1498| `a' to the double-precision floating-point format. The conversion is 1499| performed according to the IEC/IEEE Standard for Binary Floating-Point 1500| Arithmetic. 1501*----------------------------------------------------------------------------*/ 1502 1503float128 float32_to_float128( float32 a STATUS_PARAM ) 1504{ 1505 flag aSign; 1506 int16 aExp; 1507 bits32 aSig; 1508 1509 aSig = extractFloat32Frac( a ); 1510 aExp = extractFloat32Exp( a ); 1511 aSign = extractFloat32Sign( a ); 1512 if ( aExp == 0xFF ) { 1513 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) ); 1514 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1515 } 1516 if ( aExp == 0 ) { 1517 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1518 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1519 --aExp; 1520 } 1521 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1522 1523} 1524 1525#endif 1526 1527/*---------------------------------------------------------------------------- 1528| Rounds the single-precision floating-point value `a' to an integer, and 1529| returns the result as a single-precision floating-point value. The 1530| operation is performed according to the IEC/IEEE Standard for Binary 1531| Floating-Point Arithmetic. 1532*----------------------------------------------------------------------------*/ 1533 1534float32 float32_round_to_int( float32 a STATUS_PARAM) 1535{ 1536 flag aSign; 1537 int16 aExp; 1538 bits32 lastBitMask, roundBitsMask; 1539 int8 roundingMode; 1540 bits32 z; 1541 1542 aExp = extractFloat32Exp( a ); 1543 if ( 0x96 <= aExp ) { 1544 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1545 return propagateFloat32NaN( a, a STATUS_VAR ); 1546 } 1547 return a; 1548 } 1549 if ( aExp <= 0x7E ) { 1550 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a; 1551 STATUS(float_exception_flags) |= float_flag_inexact; 1552 aSign = extractFloat32Sign( a ); 1553 switch ( STATUS(float_rounding_mode) ) { 1554 case float_round_nearest_even: 1555 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1556 return packFloat32( aSign, 0x7F, 0 ); 1557 } 1558 break; 1559 case float_round_down: 1560 return make_float32(aSign ? 0xBF800000 : 0); 1561 case float_round_up: 1562 return make_float32(aSign ? 0x80000000 : 0x3F800000); 1563 } 1564 return packFloat32( aSign, 0, 0 ); 1565 } 1566 lastBitMask = 1; 1567 lastBitMask <<= 0x96 - aExp; 1568 roundBitsMask = lastBitMask - 1; 1569 z = float32_val(a); 1570 roundingMode = STATUS(float_rounding_mode); 1571 if ( roundingMode == float_round_nearest_even ) { 1572 z += lastBitMask>>1; 1573 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1574 } 1575 else if ( roundingMode != float_round_to_zero ) { 1576 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) { 1577 z += roundBitsMask; 1578 } 1579 } 1580 z &= ~ roundBitsMask; 1581 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact; 1582 return make_float32(z); 1583 1584} 1585 1586/*---------------------------------------------------------------------------- 1587| Returns the result of adding the absolute values of the single-precision 1588| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1589| before being returned. `zSign' is ignored if the result is a NaN. 1590| The addition is performed according to the IEC/IEEE Standard for Binary 1591| Floating-Point Arithmetic. 1592*----------------------------------------------------------------------------*/ 1593 1594static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1595{ 1596 int16 aExp, bExp, zExp; 1597 bits32 aSig, bSig, zSig; 1598 int16 expDiff; 1599 1600 aSig = extractFloat32Frac( a ); 1601 aExp = extractFloat32Exp( a ); 1602 bSig = extractFloat32Frac( b ); 1603 bExp = extractFloat32Exp( b ); 1604 expDiff = aExp - bExp; 1605 aSig <<= 6; 1606 bSig <<= 6; 1607 if ( 0 < expDiff ) { 1608 if ( aExp == 0xFF ) { 1609 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1610 return a; 1611 } 1612 if ( bExp == 0 ) { 1613 --expDiff; 1614 } 1615 else { 1616 bSig |= 0x20000000; 1617 } 1618 shift32RightJamming( bSig, expDiff, &bSig ); 1619 zExp = aExp; 1620 } 1621 else if ( expDiff < 0 ) { 1622 if ( bExp == 0xFF ) { 1623 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1624 return packFloat32( zSign, 0xFF, 0 ); 1625 } 1626 if ( aExp == 0 ) { 1627 ++expDiff; 1628 } 1629 else { 1630 aSig |= 0x20000000; 1631 } 1632 shift32RightJamming( aSig, - expDiff, &aSig ); 1633 zExp = bExp; 1634 } 1635 else { 1636 if ( aExp == 0xFF ) { 1637 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1638 return a; 1639 } 1640 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1641 zSig = 0x40000000 + aSig + bSig; 1642 zExp = aExp; 1643 goto roundAndPack; 1644 } 1645 aSig |= 0x20000000; 1646 zSig = ( aSig + bSig )<<1; 1647 --zExp; 1648 if ( (sbits32) zSig < 0 ) { 1649 zSig = aSig + bSig; 1650 ++zExp; 1651 } 1652 roundAndPack: 1653 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1654 1655} 1656 1657/*---------------------------------------------------------------------------- 1658| Returns the result of subtracting the absolute values of the single- 1659| precision floating-point values `a' and `b'. If `zSign' is 1, the 1660| difference is negated before being returned. `zSign' is ignored if the 1661| result is a NaN. The subtraction is performed according to the IEC/IEEE 1662| Standard for Binary Floating-Point Arithmetic. 1663*----------------------------------------------------------------------------*/ 1664 1665static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1666{ 1667 int16 aExp, bExp, zExp; 1668 bits32 aSig, bSig, zSig; 1669 int16 expDiff; 1670 1671 aSig = extractFloat32Frac( a ); 1672 aExp = extractFloat32Exp( a ); 1673 bSig = extractFloat32Frac( b ); 1674 bExp = extractFloat32Exp( b ); 1675 expDiff = aExp - bExp; 1676 aSig <<= 7; 1677 bSig <<= 7; 1678 if ( 0 < expDiff ) goto aExpBigger; 1679 if ( expDiff < 0 ) goto bExpBigger; 1680 if ( aExp == 0xFF ) { 1681 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1682 float_raise( float_flag_invalid STATUS_VAR); 1683 return float32_default_nan; 1684 } 1685 if ( aExp == 0 ) { 1686 aExp = 1; 1687 bExp = 1; 1688 } 1689 if ( bSig < aSig ) goto aBigger; 1690 if ( aSig < bSig ) goto bBigger; 1691 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 1692 bExpBigger: 1693 if ( bExp == 0xFF ) { 1694 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1695 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1696 } 1697 if ( aExp == 0 ) { 1698 ++expDiff; 1699 } 1700 else { 1701 aSig |= 0x40000000; 1702 } 1703 shift32RightJamming( aSig, - expDiff, &aSig ); 1704 bSig |= 0x40000000; 1705 bBigger: 1706 zSig = bSig - aSig; 1707 zExp = bExp; 1708 zSign ^= 1; 1709 goto normalizeRoundAndPack; 1710 aExpBigger: 1711 if ( aExp == 0xFF ) { 1712 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1713 return a; 1714 } 1715 if ( bExp == 0 ) { 1716 --expDiff; 1717 } 1718 else { 1719 bSig |= 0x40000000; 1720 } 1721 shift32RightJamming( bSig, expDiff, &bSig ); 1722 aSig |= 0x40000000; 1723 aBigger: 1724 zSig = aSig - bSig; 1725 zExp = aExp; 1726 normalizeRoundAndPack: 1727 --zExp; 1728 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1729 1730} 1731 1732/*---------------------------------------------------------------------------- 1733| Returns the result of adding the single-precision floating-point values `a' 1734| and `b'. The operation is performed according to the IEC/IEEE Standard for 1735| Binary Floating-Point Arithmetic. 1736*----------------------------------------------------------------------------*/ 1737 1738float32 float32_add( float32 a, float32 b STATUS_PARAM ) 1739{ 1740 flag aSign, bSign; 1741 1742 aSign = extractFloat32Sign( a ); 1743 bSign = extractFloat32Sign( b ); 1744 if ( aSign == bSign ) { 1745 return addFloat32Sigs( a, b, aSign STATUS_VAR); 1746 } 1747 else { 1748 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1749 } 1750 1751} 1752 1753/*---------------------------------------------------------------------------- 1754| Returns the result of subtracting the single-precision floating-point values 1755| `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1756| for Binary Floating-Point Arithmetic. 1757*----------------------------------------------------------------------------*/ 1758 1759float32 float32_sub( float32 a, float32 b STATUS_PARAM ) 1760{ 1761 flag aSign, bSign; 1762 1763 aSign = extractFloat32Sign( a ); 1764 bSign = extractFloat32Sign( b ); 1765 if ( aSign == bSign ) { 1766 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1767 } 1768 else { 1769 return addFloat32Sigs( a, b, aSign STATUS_VAR ); 1770 } 1771 1772} 1773 1774/*---------------------------------------------------------------------------- 1775| Returns the result of multiplying the single-precision floating-point values 1776| `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1777| for Binary Floating-Point Arithmetic. 1778*----------------------------------------------------------------------------*/ 1779 1780float32 float32_mul( float32 a, float32 b STATUS_PARAM ) 1781{ 1782 flag aSign, bSign, zSign; 1783 int16 aExp, bExp, zExp; 1784 bits32 aSig, bSig; 1785 bits64 zSig64; 1786 bits32 zSig; 1787 1788 aSig = extractFloat32Frac( a ); 1789 aExp = extractFloat32Exp( a ); 1790 aSign = extractFloat32Sign( a ); 1791 bSig = extractFloat32Frac( b ); 1792 bExp = extractFloat32Exp( b ); 1793 bSign = extractFloat32Sign( b ); 1794 zSign = aSign ^ bSign; 1795 if ( aExp == 0xFF ) { 1796 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1797 return propagateFloat32NaN( a, b STATUS_VAR ); 1798 } 1799 if ( ( bExp | bSig ) == 0 ) { 1800 float_raise( float_flag_invalid STATUS_VAR); 1801 return float32_default_nan; 1802 } 1803 return packFloat32( zSign, 0xFF, 0 ); 1804 } 1805 if ( bExp == 0xFF ) { 1806 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1807 if ( ( aExp | aSig ) == 0 ) { 1808 float_raise( float_flag_invalid STATUS_VAR); 1809 return float32_default_nan; 1810 } 1811 return packFloat32( zSign, 0xFF, 0 ); 1812 } 1813 if ( aExp == 0 ) { 1814 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1815 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1816 } 1817 if ( bExp == 0 ) { 1818 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1819 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1820 } 1821 zExp = aExp + bExp - 0x7F; 1822 aSig = ( aSig | 0x00800000 )<<7; 1823 bSig = ( bSig | 0x00800000 )<<8; 1824 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1825 zSig = zSig64; 1826 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1827 zSig <<= 1; 1828 --zExp; 1829 } 1830 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1831 1832} 1833 1834/*---------------------------------------------------------------------------- 1835| Returns the result of dividing the single-precision floating-point value `a' 1836| by the corresponding value `b'. The operation is performed according to the 1837| IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1838*----------------------------------------------------------------------------*/ 1839 1840float32 float32_div( float32 a, float32 b STATUS_PARAM ) 1841{ 1842 flag aSign, bSign, zSign; 1843 int16 aExp, bExp, zExp; 1844 bits32 aSig, bSig, zSig; 1845 1846 aSig = extractFloat32Frac( a ); 1847 aExp = extractFloat32Exp( a ); 1848 aSign = extractFloat32Sign( a ); 1849 bSig = extractFloat32Frac( b ); 1850 bExp = extractFloat32Exp( b ); 1851 bSign = extractFloat32Sign( b ); 1852 zSign = aSign ^ bSign; 1853 if ( aExp == 0xFF ) { 1854 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1855 if ( bExp == 0xFF ) { 1856 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1857 float_raise( float_flag_invalid STATUS_VAR); 1858 return float32_default_nan; 1859 } 1860 return packFloat32( zSign, 0xFF, 0 ); 1861 } 1862 if ( bExp == 0xFF ) { 1863 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1864 return packFloat32( zSign, 0, 0 ); 1865 } 1866 if ( bExp == 0 ) { 1867 if ( bSig == 0 ) { 1868 if ( ( aExp | aSig ) == 0 ) { 1869 float_raise( float_flag_invalid STATUS_VAR); 1870 return float32_default_nan; 1871 } 1872 float_raise( float_flag_divbyzero STATUS_VAR); 1873 return packFloat32( zSign, 0xFF, 0 ); 1874 } 1875 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1876 } 1877 if ( aExp == 0 ) { 1878 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1879 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1880 } 1881 zExp = aExp - bExp + 0x7D; 1882 aSig = ( aSig | 0x00800000 )<<7; 1883 bSig = ( bSig | 0x00800000 )<<8; 1884 if ( bSig <= ( aSig + aSig ) ) { 1885 aSig >>= 1; 1886 ++zExp; 1887 } 1888 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1889 if ( ( zSig & 0x3F ) == 0 ) { 1890 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 1891 } 1892 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1893 1894} 1895 1896/*---------------------------------------------------------------------------- 1897| Returns the remainder of the single-precision floating-point value `a' 1898| with respect to the corresponding value `b'. The operation is performed 1899| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1900*----------------------------------------------------------------------------*/ 1901 1902float32 float32_rem( float32 a, float32 b STATUS_PARAM ) 1903{ 1904 flag aSign, bSign, zSign; 1905 int16 aExp, bExp, expDiff; 1906 bits32 aSig, bSig; 1907 bits32 q; 1908 bits64 aSig64, bSig64, q64; 1909 bits32 alternateASig; 1910 sbits32 sigMean; 1911 1912 aSig = extractFloat32Frac( a ); 1913 aExp = extractFloat32Exp( a ); 1914 aSign = extractFloat32Sign( a ); 1915 bSig = extractFloat32Frac( b ); 1916 bExp = extractFloat32Exp( b ); 1917 bSign = extractFloat32Sign( b ); 1918 if ( aExp == 0xFF ) { 1919 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1920 return propagateFloat32NaN( a, b STATUS_VAR ); 1921 } 1922 float_raise( float_flag_invalid STATUS_VAR); 1923 return float32_default_nan; 1924 } 1925 if ( bExp == 0xFF ) { 1926 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1927 return a; 1928 } 1929 if ( bExp == 0 ) { 1930 if ( bSig == 0 ) { 1931 float_raise( float_flag_invalid STATUS_VAR); 1932 return float32_default_nan; 1933 } 1934 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1935 } 1936 if ( aExp == 0 ) { 1937 if ( aSig == 0 ) return a; 1938 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1939 } 1940 expDiff = aExp - bExp; 1941 aSig |= 0x00800000; 1942 bSig |= 0x00800000; 1943 if ( expDiff < 32 ) { 1944 aSig <<= 8; 1945 bSig <<= 8; 1946 if ( expDiff < 0 ) { 1947 if ( expDiff < -1 ) return a; 1948 aSig >>= 1; 1949 } 1950 q = ( bSig <= aSig ); 1951 if ( q ) aSig -= bSig; 1952 if ( 0 < expDiff ) { 1953 q = ( ( (bits64) aSig )<<32 ) / bSig; 1954 q >>= 32 - expDiff; 1955 bSig >>= 2; 1956 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 1957 } 1958 else { 1959 aSig >>= 2; 1960 bSig >>= 2; 1961 } 1962 } 1963 else { 1964 if ( bSig <= aSig ) aSig -= bSig; 1965 aSig64 = ( (bits64) aSig )<<40; 1966 bSig64 = ( (bits64) bSig )<<40; 1967 expDiff -= 64; 1968 while ( 0 < expDiff ) { 1969 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 1970 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1971 aSig64 = - ( ( bSig * q64 )<<38 ); 1972 expDiff -= 62; 1973 } 1974 expDiff += 64; 1975 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 1976 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1977 q = q64>>( 64 - expDiff ); 1978 bSig <<= 6; 1979 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 1980 } 1981 do { 1982 alternateASig = aSig; 1983 ++q; 1984 aSig -= bSig; 1985 } while ( 0 <= (sbits32) aSig ); 1986 sigMean = aSig + alternateASig; 1987 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 1988 aSig = alternateASig; 1989 } 1990 zSign = ( (sbits32) aSig < 0 ); 1991 if ( zSign ) aSig = - aSig; 1992 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR ); 1993 1994} 1995 1996/*---------------------------------------------------------------------------- 1997| Returns the square root of the single-precision floating-point value `a'. 1998| The operation is performed according to the IEC/IEEE Standard for Binary 1999| Floating-Point Arithmetic. 2000*----------------------------------------------------------------------------*/ 2001 2002float32 float32_sqrt( float32 a STATUS_PARAM ) 2003{ 2004 flag aSign; 2005 int16 aExp, zExp; 2006 bits32 aSig, zSig; 2007 bits64 rem, term; 2008 2009 aSig = extractFloat32Frac( a ); 2010 aExp = extractFloat32Exp( a ); 2011 aSign = extractFloat32Sign( a ); 2012 if ( aExp == 0xFF ) { 2013 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2014 if ( ! aSign ) return a; 2015 float_raise( float_flag_invalid STATUS_VAR); 2016 return float32_default_nan; 2017 } 2018 if ( aSign ) { 2019 if ( ( aExp | aSig ) == 0 ) return a; 2020 float_raise( float_flag_invalid STATUS_VAR); 2021 return float32_default_nan; 2022 } 2023 if ( aExp == 0 ) { 2024 if ( aSig == 0 ) return float32_zero; 2025 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2026 } 2027 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2028 aSig = ( aSig | 0x00800000 )<<8; 2029 zSig = estimateSqrt32( aExp, aSig ) + 2; 2030 if ( ( zSig & 0x7F ) <= 5 ) { 2031 if ( zSig < 2 ) { 2032 zSig = 0x7FFFFFFF; 2033 goto roundAndPack; 2034 } 2035 aSig >>= aExp & 1; 2036 term = ( (bits64) zSig ) * zSig; 2037 rem = ( ( (bits64) aSig )<<32 ) - term; 2038 while ( (sbits64) rem < 0 ) { 2039 --zSig; 2040 rem += ( ( (bits64) zSig )<<1 ) | 1; 2041 } 2042 zSig |= ( rem != 0 ); 2043 } 2044 shift32RightJamming( zSig, 1, &zSig ); 2045 roundAndPack: 2046 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR ); 2047 2048} 2049 2050/*---------------------------------------------------------------------------- 2051| Returns 1 if the single-precision floating-point value `a' is equal to 2052| the corresponding value `b', and 0 otherwise. The comparison is performed 2053| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2054*----------------------------------------------------------------------------*/ 2055 2056int float32_eq( float32 a, float32 b STATUS_PARAM ) 2057{ 2058 2059 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2060 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2061 ) { 2062 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2063 float_raise( float_flag_invalid STATUS_VAR); 2064 } 2065 return 0; 2066 } 2067 return ( float32_val(a) == float32_val(b) ) || 2068 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 2069 2070} 2071 2072/*---------------------------------------------------------------------------- 2073| Returns 1 if the single-precision floating-point value `a' is less than 2074| or equal to the corresponding value `b', and 0 otherwise. The comparison 2075| is performed according to the IEC/IEEE Standard for Binary Floating-Point 2076| Arithmetic. 2077*----------------------------------------------------------------------------*/ 2078 2079int float32_le( float32 a, float32 b STATUS_PARAM ) 2080{ 2081 flag aSign, bSign; 2082 bits32 av, bv; 2083 2084 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2085 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2086 ) { 2087 float_raise( float_flag_invalid STATUS_VAR); 2088 return 0; 2089 } 2090 aSign = extractFloat32Sign( a ); 2091 bSign = extractFloat32Sign( b ); 2092 av = float32_val(a); 2093 bv = float32_val(b); 2094 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2095 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2096 2097} 2098 2099/*---------------------------------------------------------------------------- 2100| Returns 1 if the single-precision floating-point value `a' is less than 2101| the corresponding value `b', and 0 otherwise. The comparison is performed 2102| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2103*----------------------------------------------------------------------------*/ 2104 2105int float32_lt( float32 a, float32 b STATUS_PARAM ) 2106{ 2107 flag aSign, bSign; 2108 bits32 av, bv; 2109 2110 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2111 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2112 ) { 2113 float_raise( float_flag_invalid STATUS_VAR); 2114 return 0; 2115 } 2116 aSign = extractFloat32Sign( a ); 2117 bSign = extractFloat32Sign( b ); 2118 av = float32_val(a); 2119 bv = float32_val(b); 2120 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); 2121 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2122 2123} 2124 2125/*---------------------------------------------------------------------------- 2126| Returns 1 if the single-precision floating-point value `a' is equal to 2127| the corresponding value `b', and 0 otherwise. The invalid exception is 2128| raised if either operand is a NaN. Otherwise, the comparison is performed 2129| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2130*----------------------------------------------------------------------------*/ 2131 2132int float32_eq_signaling( float32 a, float32 b STATUS_PARAM ) 2133{ 2134 bits32 av, bv; 2135 2136 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2137 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2138 ) { 2139 float_raise( float_flag_invalid STATUS_VAR); 2140 return 0; 2141 } 2142 av = float32_val(a); 2143 bv = float32_val(b); 2144 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2145 2146} 2147 2148/*---------------------------------------------------------------------------- 2149| Returns 1 if the single-precision floating-point value `a' is less than or 2150| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2151| cause an exception. Otherwise, the comparison is performed according to the 2152| IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2153*----------------------------------------------------------------------------*/ 2154 2155int float32_le_quiet( float32 a, float32 b STATUS_PARAM ) 2156{ 2157 flag aSign, bSign; 2158 bits32 av, bv; 2159 2160 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2161 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2162 ) { 2163 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2164 float_raise( float_flag_invalid STATUS_VAR); 2165 } 2166 return 0; 2167 } 2168 aSign = extractFloat32Sign( a ); 2169 bSign = extractFloat32Sign( b ); 2170 av = float32_val(a); 2171 bv = float32_val(b); 2172 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2173 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2174 2175} 2176 2177/*---------------------------------------------------------------------------- 2178| Returns 1 if the single-precision floating-point value `a' is less than 2179| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2180| exception. Otherwise, the comparison is performed according to the IEC/IEEE 2181| Standard for Binary Floating-Point Arithmetic. 2182*----------------------------------------------------------------------------*/ 2183 2184int float32_lt_quiet( float32 a, float32 b STATUS_PARAM ) 2185{ 2186 flag aSign, bSign; 2187 bits32 av, bv; 2188 2189 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2190 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2191 ) { 2192 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2193 float_raise( float_flag_invalid STATUS_VAR); 2194 } 2195 return 0; 2196 } 2197 aSign = extractFloat32Sign( a ); 2198 bSign = extractFloat32Sign( b ); 2199 av = float32_val(a); 2200 bv = float32_val(b); 2201 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); 2202 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2203 2204} 2205 2206/*---------------------------------------------------------------------------- 2207| Returns the result of converting the double-precision floating-point value 2208| `a' to the 32-bit two's complement integer format. The conversion is 2209| performed according to the IEC/IEEE Standard for Binary Floating-Point 2210| Arithmetic---which means in particular that the conversion is rounded 2211| according to the current rounding mode. If `a' is a NaN, the largest 2212| positive integer is returned. Otherwise, if the conversion overflows, the 2213| largest integer with the same sign as `a' is returned. 2214*----------------------------------------------------------------------------*/ 2215 2216int32 float64_to_int32( float64 a STATUS_PARAM ) 2217{ 2218 flag aSign; 2219 int16 aExp, shiftCount; 2220 bits64 aSig; 2221 2222 aSig = extractFloat64Frac( a ); 2223 aExp = extractFloat64Exp( a ); 2224 aSign = extractFloat64Sign( a ); 2225 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2226 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2227 shiftCount = 0x42C - aExp; 2228 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2229 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 2230 2231} 2232 2233/*---------------------------------------------------------------------------- 2234| Returns the result of converting the double-precision floating-point value 2235| `a' to the 32-bit two's complement integer format. The conversion is 2236| performed according to the IEC/IEEE Standard for Binary Floating-Point 2237| Arithmetic, except that the conversion is always rounded toward zero. 2238| If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2239| the conversion overflows, the largest integer with the same sign as `a' is 2240| returned. 2241*----------------------------------------------------------------------------*/ 2242 2243int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM ) 2244{ 2245 flag aSign; 2246 int16 aExp, shiftCount; 2247 bits64 aSig, savedASig; 2248 int32 z; 2249 2250 aSig = extractFloat64Frac( a ); 2251 aExp = extractFloat64Exp( a ); 2252 aSign = extractFloat64Sign( a ); 2253 if ( 0x41E < aExp ) { 2254 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2255 goto invalid; 2256 } 2257 else if ( aExp < 0x3FF ) { 2258 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2259 return 0; 2260 } 2261 aSig |= LIT64( 0x0010000000000000 ); 2262 shiftCount = 0x433 - aExp; 2263 savedASig = aSig; 2264 aSig >>= shiftCount; 2265 z = aSig; 2266 if ( aSign ) z = - z; 2267 if ( ( z < 0 ) ^ aSign ) { 2268 invalid: 2269 float_raise( float_flag_invalid STATUS_VAR); 2270 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2271 } 2272 if ( ( aSig<<shiftCount ) != savedASig ) { 2273 STATUS(float_exception_flags) |= float_flag_inexact; 2274 } 2275 return z; 2276 2277} 2278 2279/*---------------------------------------------------------------------------- 2280| Returns the result of converting the double-precision floating-point value 2281| `a' to the 64-bit two's complement integer format. The conversion is 2282| performed according to the IEC/IEEE Standard for Binary Floating-Point 2283| Arithmetic---which means in particular that the conversion is rounded 2284| according to the current rounding mode. If `a' is a NaN, the largest 2285| positive integer is returned. Otherwise, if the conversion overflows, the 2286| largest integer with the same sign as `a' is returned. 2287*----------------------------------------------------------------------------*/ 2288 2289int64 float64_to_int64( float64 a STATUS_PARAM ) 2290{ 2291 flag aSign; 2292 int16 aExp, shiftCount; 2293 bits64 aSig, aSigExtra; 2294 2295 aSig = extractFloat64Frac( a ); 2296 aExp = extractFloat64Exp( a ); 2297 aSign = extractFloat64Sign( a ); 2298 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2299 shiftCount = 0x433 - aExp; 2300 if ( shiftCount <= 0 ) { 2301 if ( 0x43E < aExp ) { 2302 float_raise( float_flag_invalid STATUS_VAR); 2303 if ( ! aSign 2304 || ( ( aExp == 0x7FF ) 2305 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2306 ) { 2307 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2308 } 2309 return (sbits64) LIT64( 0x8000000000000000 ); 2310 } 2311 aSigExtra = 0; 2312 aSig <<= - shiftCount; 2313 } 2314 else { 2315 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2316 } 2317 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 2318 2319} 2320 2321/*---------------------------------------------------------------------------- 2322| Returns the result of converting the double-precision floating-point value 2323| `a' to the 64-bit two's complement integer format. The conversion is 2324| performed according to the IEC/IEEE Standard for Binary Floating-Point 2325| Arithmetic, except that the conversion is always rounded toward zero. 2326| If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2327| the conversion overflows, the largest integer with the same sign as `a' is 2328| returned. 2329*----------------------------------------------------------------------------*/ 2330 2331int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM ) 2332{ 2333 flag aSign; 2334 int16 aExp, shiftCount; 2335 bits64 aSig; 2336 int64 z; 2337 2338 aSig = extractFloat64Frac( a ); 2339 aExp = extractFloat64Exp( a ); 2340 aSign = extractFloat64Sign( a ); 2341 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2342 shiftCount = aExp - 0x433; 2343 if ( 0 <= shiftCount ) { 2344 if ( 0x43E <= aExp ) { 2345 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 2346 float_raise( float_flag_invalid STATUS_VAR); 2347 if ( ! aSign 2348 || ( ( aExp == 0x7FF ) 2349 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2350 ) { 2351 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2352 } 2353 } 2354 return (sbits64) LIT64( 0x8000000000000000 ); 2355 } 2356 z = aSig<<shiftCount; 2357 } 2358 else { 2359 if ( aExp < 0x3FE ) { 2360 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2361 return 0; 2362 } 2363 z = aSig>>( - shiftCount ); 2364 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2365 STATUS(float_exception_flags) |= float_flag_inexact; 2366 } 2367 } 2368 if ( aSign ) z = - z; 2369 return z; 2370 2371} 2372 2373/*---------------------------------------------------------------------------- 2374| Returns the result of converting the double-precision floating-point value 2375| `a' to the single-precision floating-point format. The conversion is 2376| performed according to the IEC/IEEE Standard for Binary Floating-Point 2377| Arithmetic. 2378*----------------------------------------------------------------------------*/ 2379 2380float32 float64_to_float32( float64 a STATUS_PARAM ) 2381{ 2382 flag aSign; 2383 int16 aExp; 2384 bits64 aSig; 2385 bits32 zSig; 2386 2387 aSig = extractFloat64Frac( a ); 2388 aExp = extractFloat64Exp( a ); 2389 aSign = extractFloat64Sign( a ); 2390 if ( aExp == 0x7FF ) { 2391 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) ); 2392 return packFloat32( aSign, 0xFF, 0 ); 2393 } 2394 shift64RightJamming( aSig, 22, &aSig ); 2395 zSig = aSig; 2396 if ( aExp || zSig ) { 2397 zSig |= 0x40000000; 2398 aExp -= 0x381; 2399 } 2400 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 2401 2402} 2403 2404#ifdef FLOATX80 2405 2406/*---------------------------------------------------------------------------- 2407| Returns the result of converting the double-precision floating-point value 2408| `a' to the extended double-precision floating-point format. The conversion 2409| is performed according to the IEC/IEEE Standard for Binary Floating-Point 2410| Arithmetic. 2411*----------------------------------------------------------------------------*/ 2412 2413floatx80 float64_to_floatx80( float64 a STATUS_PARAM ) 2414{ 2415 flag aSign; 2416 int16 aExp; 2417 bits64 aSig; 2418 2419 aSig = extractFloat64Frac( a ); 2420 aExp = extractFloat64Exp( a ); 2421 aSign = extractFloat64Sign( a ); 2422 if ( aExp == 0x7FF ) { 2423 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) ); 2424 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2425 } 2426 if ( aExp == 0 ) { 2427 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2428 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2429 } 2430 return 2431 packFloatx80( 2432 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2433 2434} 2435 2436#endif 2437 2438#ifdef FLOAT128 2439 2440/*---------------------------------------------------------------------------- 2441| Returns the result of converting the double-precision floating-point value 2442| `a' to the quadruple-precision floating-point format. The conversion is 2443| performed according to the IEC/IEEE Standard for Binary Floating-Point 2444| Arithmetic. 2445*----------------------------------------------------------------------------*/ 2446 2447float128 float64_to_float128( float64 a STATUS_PARAM ) 2448{ 2449 flag aSign; 2450 int16 aExp; 2451 bits64 aSig, zSig0, zSig1; 2452 2453 aSig = extractFloat64Frac( a ); 2454 aExp = extractFloat64Exp( a ); 2455 aSign = extractFloat64Sign( a ); 2456 if ( aExp == 0x7FF ) { 2457 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) ); 2458 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2459 } 2460 if ( aExp == 0 ) { 2461 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2462 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2463 --aExp; 2464 } 2465 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2466 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2467 2468} 2469 2470#endif 2471 2472/*---------------------------------------------------------------------------- 2473| Rounds the double-precision floating-point value `a' to an integer, and 2474| returns the result as a double-precision floating-point value. The 2475| operation is performed according to the IEC/IEEE Standard for Binary 2476| Floating-Point Arithmetic. 2477*----------------------------------------------------------------------------*/ 2478 2479float64 float64_round_to_int( float64 a STATUS_PARAM ) 2480{ 2481 flag aSign; 2482 int16 aExp; 2483 bits64 lastBitMask, roundBitsMask; 2484 int8 roundingMode; 2485 bits64 z; 2486 2487 aExp = extractFloat64Exp( a ); 2488 if ( 0x433 <= aExp ) { 2489 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2490 return propagateFloat64NaN( a, a STATUS_VAR ); 2491 } 2492 return a; 2493 } 2494 if ( aExp < 0x3FF ) { 2495 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a; 2496 STATUS(float_exception_flags) |= float_flag_inexact; 2497 aSign = extractFloat64Sign( a ); 2498 switch ( STATUS(float_rounding_mode) ) { 2499 case float_round_nearest_even: 2500 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2501 return packFloat64( aSign, 0x3FF, 0 ); 2502 } 2503 break; 2504 case float_round_down: 2505 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 2506 case float_round_up: 2507 return make_float64( 2508 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 2509 } 2510 return packFloat64( aSign, 0, 0 ); 2511 } 2512 lastBitMask = 1; 2513 lastBitMask <<= 0x433 - aExp; 2514 roundBitsMask = lastBitMask - 1; 2515 z = float64_val(a); 2516 roundingMode = STATUS(float_rounding_mode); 2517 if ( roundingMode == float_round_nearest_even ) { 2518 z += lastBitMask>>1; 2519 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2520 } 2521 else if ( roundingMode != float_round_to_zero ) { 2522 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) { 2523 z += roundBitsMask; 2524 } 2525 } 2526 z &= ~ roundBitsMask; 2527 if ( z != float64_val(a) ) 2528 STATUS(float_exception_flags) |= float_flag_inexact; 2529 return make_float64(z); 2530 2531} 2532 2533float64 float64_trunc_to_int( float64 a STATUS_PARAM) 2534{ 2535 int oldmode; 2536 float64 res; 2537 oldmode = STATUS(float_rounding_mode); 2538 STATUS(float_rounding_mode) = float_round_to_zero; 2539 res = float64_round_to_int(a STATUS_VAR); 2540 STATUS(float_rounding_mode) = oldmode; 2541 return res; 2542} 2543 2544/*---------------------------------------------------------------------------- 2545| Returns the result of adding the absolute values of the double-precision 2546| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2547| before being returned. `zSign' is ignored if the result is a NaN. 2548| The addition is performed according to the IEC/IEEE Standard for Binary 2549| Floating-Point Arithmetic. 2550*----------------------------------------------------------------------------*/ 2551 2552static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 2553{ 2554 int16 aExp, bExp, zExp; 2555 bits64 aSig, bSig, zSig; 2556 int16 expDiff; 2557 2558 aSig = extractFloat64Frac( a ); 2559 aExp = extractFloat64Exp( a ); 2560 bSig = extractFloat64Frac( b ); 2561 bExp = extractFloat64Exp( b ); 2562 expDiff = aExp - bExp; 2563 aSig <<= 9; 2564 bSig <<= 9; 2565 if ( 0 < expDiff ) { 2566 if ( aExp == 0x7FF ) { 2567 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2568 return a; 2569 } 2570 if ( bExp == 0 ) { 2571 --expDiff; 2572 } 2573 else { 2574 bSig |= LIT64( 0x2000000000000000 ); 2575 } 2576 shift64RightJamming( bSig, expDiff, &bSig ); 2577 zExp = aExp; 2578 } 2579 else if ( expDiff < 0 ) { 2580 if ( bExp == 0x7FF ) { 2581 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2582 return packFloat64( zSign, 0x7FF, 0 ); 2583 } 2584 if ( aExp == 0 ) { 2585 ++expDiff; 2586 } 2587 else { 2588 aSig |= LIT64( 0x2000000000000000 ); 2589 } 2590 shift64RightJamming( aSig, - expDiff, &aSig ); 2591 zExp = bExp; 2592 } 2593 else { 2594 if ( aExp == 0x7FF ) { 2595 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2596 return a; 2597 } 2598 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2599 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2600 zExp = aExp; 2601 goto roundAndPack; 2602 } 2603 aSig |= LIT64( 0x2000000000000000 ); 2604 zSig = ( aSig + bSig )<<1; 2605 --zExp; 2606 if ( (sbits64) zSig < 0 ) { 2607 zSig = aSig + bSig; 2608 ++zExp; 2609 } 2610 roundAndPack: 2611 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 2612 2613} 2614 2615/*---------------------------------------------------------------------------- 2616| Returns the result of subtracting the absolute values of the double- 2617| precision floating-point values `a' and `b'. If `zSign' is 1, the 2618| difference is negated before being returned. `zSign' is ignored if the 2619| result is a NaN. The subtraction is performed according to the IEC/IEEE 2620| Standard for Binary Floating-Point Arithmetic. 2621*----------------------------------------------------------------------------*/ 2622 2623static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 2624{ 2625 int16 aExp, bExp, zExp; 2626 bits64 aSig, bSig, zSig; 2627 int16 expDiff; 2628 2629 aSig = extractFloat64Frac( a ); 2630 aExp = extractFloat64Exp( a ); 2631 bSig = extractFloat64Frac( b ); 2632 bExp = extractFloat64Exp( b ); 2633 expDiff = aExp - bExp; 2634 aSig <<= 10; 2635 bSig <<= 10; 2636 if ( 0 < expDiff ) goto aExpBigger; 2637 if ( expDiff < 0 ) goto bExpBigger; 2638 if ( aExp == 0x7FF ) { 2639 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2640 float_raise( float_flag_invalid STATUS_VAR); 2641 return float64_default_nan; 2642 } 2643 if ( aExp == 0 ) { 2644 aExp = 1; 2645 bExp = 1; 2646 } 2647 if ( bSig < aSig ) goto aBigger; 2648 if ( aSig < bSig ) goto bBigger; 2649 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 2650 bExpBigger: 2651 if ( bExp == 0x7FF ) { 2652 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2653 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2654 } 2655 if ( aExp == 0 ) { 2656 ++expDiff; 2657 } 2658 else { 2659 aSig |= LIT64( 0x4000000000000000 ); 2660 } 2661 shift64RightJamming( aSig, - expDiff, &aSig ); 2662 bSig |= LIT64( 0x4000000000000000 ); 2663 bBigger: 2664 zSig = bSig - aSig; 2665 zExp = bExp; 2666 zSign ^= 1; 2667 goto normalizeRoundAndPack; 2668 aExpBigger: 2669 if ( aExp == 0x7FF ) { 2670 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2671 return a; 2672 } 2673 if ( bExp == 0 ) { 2674 --expDiff; 2675 } 2676 else { 2677 bSig |= LIT64( 0x4000000000000000 ); 2678 } 2679 shift64RightJamming( bSig, expDiff, &bSig ); 2680 aSig |= LIT64( 0x4000000000000000 ); 2681 aBigger: 2682 zSig = aSig - bSig; 2683 zExp = aExp; 2684 normalizeRoundAndPack: 2685 --zExp; 2686 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 2687 2688} 2689 2690/*---------------------------------------------------------------------------- 2691| Returns the result of adding the double-precision floating-point values `a' 2692| and `b'. The operation is performed according to the IEC/IEEE Standard for 2693| Binary Floating-Point Arithmetic. 2694*----------------------------------------------------------------------------*/ 2695 2696float64 float64_add( float64 a, float64 b STATUS_PARAM ) 2697{ 2698 flag aSign, bSign; 2699 2700 aSign = extractFloat64Sign( a ); 2701 bSign = extractFloat64Sign( b ); 2702 if ( aSign == bSign ) { 2703 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 2704 } 2705 else { 2706 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 2707 } 2708 2709} 2710 2711/*---------------------------------------------------------------------------- 2712| Returns the result of subtracting the double-precision floating-point values 2713| `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2714| for Binary Floating-Point Arithmetic. 2715*----------------------------------------------------------------------------*/ 2716 2717float64 float64_sub( float64 a, float64 b STATUS_PARAM ) 2718{ 2719 flag aSign, bSign; 2720 2721 aSign = extractFloat64Sign( a ); 2722 bSign = extractFloat64Sign( b ); 2723 if ( aSign == bSign ) { 2724 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 2725 } 2726 else { 2727 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 2728 } 2729 2730} 2731 2732/*---------------------------------------------------------------------------- 2733| Returns the result of multiplying the double-precision floating-point values 2734| `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2735| for Binary Floating-Point Arithmetic. 2736*----------------------------------------------------------------------------*/ 2737 2738float64 float64_mul( float64 a, float64 b STATUS_PARAM ) 2739{ 2740 flag aSign, bSign, zSign; 2741 int16 aExp, bExp, zExp; 2742 bits64 aSig, bSig, zSig0, zSig1; 2743 2744 aSig = extractFloat64Frac( a ); 2745 aExp = extractFloat64Exp( a ); 2746 aSign = extractFloat64Sign( a ); 2747 bSig = extractFloat64Frac( b ); 2748 bExp = extractFloat64Exp( b ); 2749 bSign = extractFloat64Sign( b ); 2750 zSign = aSign ^ bSign; 2751 if ( aExp == 0x7FF ) { 2752 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2753 return propagateFloat64NaN( a, b STATUS_VAR ); 2754 } 2755 if ( ( bExp | bSig ) == 0 ) { 2756 float_raise( float_flag_invalid STATUS_VAR); 2757 return float64_default_nan; 2758 } 2759 return packFloat64( zSign, 0x7FF, 0 ); 2760 } 2761 if ( bExp == 0x7FF ) { 2762 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2763 if ( ( aExp | aSig ) == 0 ) { 2764 float_raise( float_flag_invalid STATUS_VAR); 2765 return float64_default_nan; 2766 } 2767 return packFloat64( zSign, 0x7FF, 0 ); 2768 } 2769 if ( aExp == 0 ) { 2770 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2771 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2772 } 2773 if ( bExp == 0 ) { 2774 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2775 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2776 } 2777 zExp = aExp + bExp - 0x3FF; 2778 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2779 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2780 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2781 zSig0 |= ( zSig1 != 0 ); 2782 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2783 zSig0 <<= 1; 2784 --zExp; 2785 } 2786 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR ); 2787 2788} 2789 2790/*---------------------------------------------------------------------------- 2791| Returns the result of dividing the double-precision floating-point value `a' 2792| by the corresponding value `b'. The operation is performed according to 2793| the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2794*----------------------------------------------------------------------------*/ 2795 2796float64 float64_div( float64 a, float64 b STATUS_PARAM ) 2797{ 2798 flag aSign, bSign, zSign; 2799 int16 aExp, bExp, zExp; 2800 bits64 aSig, bSig, zSig; 2801 bits64 rem0, rem1; 2802 bits64 term0, term1; 2803 2804 aSig = extractFloat64Frac( a ); 2805 aExp = extractFloat64Exp( a ); 2806 aSign = extractFloat64Sign( a ); 2807 bSig = extractFloat64Frac( b ); 2808 bExp = extractFloat64Exp( b ); 2809 bSign = extractFloat64Sign( b ); 2810 zSign = aSign ^ bSign; 2811 if ( aExp == 0x7FF ) { 2812 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2813 if ( bExp == 0x7FF ) { 2814 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2815 float_raise( float_flag_invalid STATUS_VAR); 2816 return float64_default_nan; 2817 } 2818 return packFloat64( zSign, 0x7FF, 0 ); 2819 } 2820 if ( bExp == 0x7FF ) { 2821 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2822 return packFloat64( zSign, 0, 0 ); 2823 } 2824 if ( bExp == 0 ) { 2825 if ( bSig == 0 ) { 2826 if ( ( aExp | aSig ) == 0 ) { 2827 float_raise( float_flag_invalid STATUS_VAR); 2828 return float64_default_nan; 2829 } 2830 float_raise( float_flag_divbyzero STATUS_VAR); 2831 return packFloat64( zSign, 0x7FF, 0 ); 2832 } 2833 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2834 } 2835 if ( aExp == 0 ) { 2836 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2837 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2838 } 2839 zExp = aExp - bExp + 0x3FD; 2840 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2841 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2842 if ( bSig <= ( aSig + aSig ) ) { 2843 aSig >>= 1; 2844 ++zExp; 2845 } 2846 zSig = estimateDiv128To64( aSig, 0, bSig ); 2847 if ( ( zSig & 0x1FF ) <= 2 ) { 2848 mul64To128( bSig, zSig, &term0, &term1 ); 2849 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2850 while ( (sbits64) rem0 < 0 ) { 2851 --zSig; 2852 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2853 } 2854 zSig |= ( rem1 != 0 ); 2855 } 2856 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 2857 2858} 2859 2860/*---------------------------------------------------------------------------- 2861| Returns the remainder of the double-precision floating-point value `a' 2862| with respect to the corresponding value `b'. The operation is performed 2863| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2864*----------------------------------------------------------------------------*/ 2865 2866float64 float64_rem( float64 a, float64 b STATUS_PARAM ) 2867{ 2868 flag aSign, bSign, zSign; 2869 int16 aExp, bExp, expDiff; 2870 bits64 aSig, bSig; 2871 bits64 q, alternateASig; 2872 sbits64 sigMean; 2873 2874 aSig = extractFloat64Frac( a ); 2875 aExp = extractFloat64Exp( a ); 2876 aSign = extractFloat64Sign( a ); 2877 bSig = extractFloat64Frac( b ); 2878 bExp = extractFloat64Exp( b ); 2879 bSign = extractFloat64Sign( b ); 2880 if ( aExp == 0x7FF ) { 2881 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2882 return propagateFloat64NaN( a, b STATUS_VAR ); 2883 } 2884 float_raise( float_flag_invalid STATUS_VAR); 2885 return float64_default_nan; 2886 } 2887 if ( bExp == 0x7FF ) { 2888 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2889 return a; 2890 } 2891 if ( bExp == 0 ) { 2892 if ( bSig == 0 ) { 2893 float_raise( float_flag_invalid STATUS_VAR); 2894 return float64_default_nan; 2895 } 2896 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2897 } 2898 if ( aExp == 0 ) { 2899 if ( aSig == 0 ) return a; 2900 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2901 } 2902 expDiff = aExp - bExp; 2903 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2904 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2905 if ( expDiff < 0 ) { 2906 if ( expDiff < -1 ) return a; 2907 aSig >>= 1; 2908 } 2909 q = ( bSig <= aSig ); 2910 if ( q ) aSig -= bSig; 2911 expDiff -= 64; 2912 while ( 0 < expDiff ) { 2913 q = estimateDiv128To64( aSig, 0, bSig ); 2914 q = ( 2 < q ) ? q - 2 : 0; 2915 aSig = - ( ( bSig>>2 ) * q ); 2916 expDiff -= 62; 2917 } 2918 expDiff += 64; 2919 if ( 0 < expDiff ) { 2920 q = estimateDiv128To64( aSig, 0, bSig ); 2921 q = ( 2 < q ) ? q - 2 : 0; 2922 q >>= 64 - expDiff; 2923 bSig >>= 2; 2924 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2925 } 2926 else { 2927 aSig >>= 2; 2928 bSig >>= 2; 2929 } 2930 do { 2931 alternateASig = aSig; 2932 ++q; 2933 aSig -= bSig; 2934 } while ( 0 <= (sbits64) aSig ); 2935 sigMean = aSig + alternateASig; 2936 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2937 aSig = alternateASig; 2938 } 2939 zSign = ( (sbits64) aSig < 0 ); 2940 if ( zSign ) aSig = - aSig; 2941 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR ); 2942 2943} 2944 2945/*---------------------------------------------------------------------------- 2946| Returns the square root of the double-precision floating-point value `a'. 2947| The operation is performed according to the IEC/IEEE Standard for Binary 2948| Floating-Point Arithmetic. 2949*----------------------------------------------------------------------------*/ 2950 2951float64 float64_sqrt( float64 a STATUS_PARAM ) 2952{ 2953 flag aSign; 2954 int16 aExp, zExp; 2955 bits64 aSig, zSig, doubleZSig; 2956 bits64 rem0, rem1, term0, term1; 2957 2958 aSig = extractFloat64Frac( a ); 2959 aExp = extractFloat64Exp( a ); 2960 aSign = extractFloat64Sign( a ); 2961 if ( aExp == 0x7FF ) { 2962 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR ); 2963 if ( ! aSign ) return a; 2964 float_raise( float_flag_invalid STATUS_VAR); 2965 return float64_default_nan; 2966 } 2967 if ( aSign ) { 2968 if ( ( aExp | aSig ) == 0 ) return a; 2969 float_raise( float_flag_invalid STATUS_VAR); 2970 return float64_default_nan; 2971 } 2972 if ( aExp == 0 ) { 2973 if ( aSig == 0 ) return float64_zero; 2974 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2975 } 2976 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 2977 aSig |= LIT64( 0x0010000000000000 ); 2978 zSig = estimateSqrt32( aExp, aSig>>21 ); 2979 aSig <<= 9 - ( aExp & 1 ); 2980 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 2981 if ( ( zSig & 0x1FF ) <= 5 ) { 2982 doubleZSig = zSig<<1; 2983 mul64To128( zSig, zSig, &term0, &term1 ); 2984 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2985 while ( (sbits64) rem0 < 0 ) { 2986 --zSig; 2987 doubleZSig -= 2; 2988 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 2989 } 2990 zSig |= ( ( rem0 | rem1 ) != 0 ); 2991 } 2992 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR ); 2993 2994} 2995 2996/*---------------------------------------------------------------------------- 2997| Returns 1 if the double-precision floating-point value `a' is equal to the 2998| corresponding value `b', and 0 otherwise. The comparison is performed 2999| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3000*----------------------------------------------------------------------------*/ 3001 3002int float64_eq( float64 a, float64 b STATUS_PARAM ) 3003{ 3004 bits64 av, bv; 3005 3006 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3007 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3008 ) { 3009 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3010 float_raise( float_flag_invalid STATUS_VAR); 3011 } 3012 return 0; 3013 } 3014 av = float64_val(a); 3015 bv = float64_val(b); 3016 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3017 3018} 3019 3020/*---------------------------------------------------------------------------- 3021| Returns 1 if the double-precision floating-point value `a' is less than or 3022| equal to the corresponding value `b', and 0 otherwise. The comparison is 3023| performed according to the IEC/IEEE Standard for Binary Floating-Point 3024| Arithmetic. 3025*----------------------------------------------------------------------------*/ 3026 3027int float64_le( float64 a, float64 b STATUS_PARAM ) 3028{ 3029 flag aSign, bSign; 3030 bits64 av, bv; 3031 3032 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3033 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3034 ) { 3035 float_raise( float_flag_invalid STATUS_VAR); 3036 return 0; 3037 } 3038 aSign = extractFloat64Sign( a ); 3039 bSign = extractFloat64Sign( b ); 3040 av = float64_val(a); 3041 bv = float64_val(b); 3042 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3043 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3044 3045} 3046 3047/*---------------------------------------------------------------------------- 3048| Returns 1 if the double-precision floating-point value `a' is less than 3049| the corresponding value `b', and 0 otherwise. The comparison is performed 3050| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3051*----------------------------------------------------------------------------*/ 3052 3053int float64_lt( float64 a, float64 b STATUS_PARAM ) 3054{ 3055 flag aSign, bSign; 3056 bits64 av, bv; 3057 3058 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3059 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3060 ) { 3061 float_raise( float_flag_invalid STATUS_VAR); 3062 return 0; 3063 } 3064 aSign = extractFloat64Sign( a ); 3065 bSign = extractFloat64Sign( b ); 3066 av = float64_val(a); 3067 bv = float64_val(b); 3068 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); 3069 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3070 3071} 3072 3073/*---------------------------------------------------------------------------- 3074| Returns 1 if the double-precision floating-point value `a' is equal to the 3075| corresponding value `b', and 0 otherwise. The invalid exception is raised 3076| if either operand is a NaN. Otherwise, the comparison is performed 3077| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3078*----------------------------------------------------------------------------*/ 3079 3080int float64_eq_signaling( float64 a, float64 b STATUS_PARAM ) 3081{ 3082 bits64 av, bv; 3083 3084 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3085 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3086 ) { 3087 float_raise( float_flag_invalid STATUS_VAR); 3088 return 0; 3089 } 3090 av = float64_val(a); 3091 bv = float64_val(b); 3092 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3093 3094} 3095 3096/*---------------------------------------------------------------------------- 3097| Returns 1 if the double-precision floating-point value `a' is less than or 3098| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3099| cause an exception. Otherwise, the comparison is performed according to the 3100| IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3101*----------------------------------------------------------------------------*/ 3102 3103int float64_le_quiet( float64 a, float64 b STATUS_PARAM ) 3104{ 3105 flag aSign, bSign; 3106 bits64 av, bv; 3107 3108 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3109 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3110 ) { 3111 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3112 float_raise( float_flag_invalid STATUS_VAR); 3113 } 3114 return 0; 3115 } 3116 aSign = extractFloat64Sign( a ); 3117 bSign = extractFloat64Sign( b ); 3118 av = float64_val(a); 3119 bv = float64_val(b); 3120 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3121 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3122 3123} 3124 3125/*---------------------------------------------------------------------------- 3126| Returns 1 if the double-precision floating-point value `a' is less than 3127| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3128| exception. Otherwise, the comparison is performed according to the IEC/IEEE 3129| Standard for Binary Floating-Point Arithmetic. 3130*----------------------------------------------------------------------------*/ 3131 3132int float64_lt_quiet( float64 a, float64 b STATUS_PARAM ) 3133{ 3134 flag aSign, bSign; 3135 bits64 av, bv; 3136 3137 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3138 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3139 ) { 3140 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3141 float_raise( float_flag_invalid STATUS_VAR); 3142 } 3143 return 0; 3144 } 3145 aSign = extractFloat64Sign( a ); 3146 bSign = extractFloat64Sign( b ); 3147 av = float64_val(a); 3148 bv = float64_val(b); 3149 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); 3150 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3151 3152} 3153 3154#ifdef FLOATX80 3155 3156/*---------------------------------------------------------------------------- 3157| Returns the result of converting the extended double-precision floating- 3158| point value `a' to the 32-bit two's complement integer format. The 3159| conversion is performed according to the IEC/IEEE Standard for Binary 3160| Floating-Point Arithmetic---which means in particular that the conversion 3161| is rounded according to the current rounding mode. If `a' is a NaN, the 3162| largest positive integer is returned. Otherwise, if the conversion 3163| overflows, the largest integer with the same sign as `a' is returned. 3164*----------------------------------------------------------------------------*/ 3165 3166int32 floatx80_to_int32( floatx80 a STATUS_PARAM ) 3167{ 3168 flag aSign; 3169 int32 aExp, shiftCount; 3170 bits64 aSig; 3171 3172 aSig = extractFloatx80Frac( a ); 3173 aExp = extractFloatx80Exp( a ); 3174 aSign = extractFloatx80Sign( a ); 3175 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3176 shiftCount = 0x4037 - aExp; 3177 if ( shiftCount <= 0 ) shiftCount = 1; 3178 shift64RightJamming( aSig, shiftCount, &aSig ); 3179 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 3180 3181} 3182 3183/*---------------------------------------------------------------------------- 3184| Returns the result of converting the extended double-precision floating- 3185| point value `a' to the 32-bit two's complement integer format. The 3186| conversion is performed according to the IEC/IEEE Standard for Binary 3187| Floating-Point Arithmetic, except that the conversion is always rounded 3188| toward zero. If `a' is a NaN, the largest positive integer is returned. 3189| Otherwise, if the conversion overflows, the largest integer with the same 3190| sign as `a' is returned. 3191*----------------------------------------------------------------------------*/ 3192 3193int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM ) 3194{ 3195 flag aSign; 3196 int32 aExp, shiftCount; 3197 bits64 aSig, savedASig; 3198 int32 z; 3199 3200 aSig = extractFloatx80Frac( a ); 3201 aExp = extractFloatx80Exp( a ); 3202 aSign = extractFloatx80Sign( a ); 3203 if ( 0x401E < aExp ) { 3204 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3205 goto invalid; 3206 } 3207 else if ( aExp < 0x3FFF ) { 3208 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3209 return 0; 3210 } 3211 shiftCount = 0x403E - aExp; 3212 savedASig = aSig; 3213 aSig >>= shiftCount; 3214 z = aSig; 3215 if ( aSign ) z = - z; 3216 if ( ( z < 0 ) ^ aSign ) { 3217 invalid: 3218 float_raise( float_flag_invalid STATUS_VAR); 3219 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3220 } 3221 if ( ( aSig<<shiftCount ) != savedASig ) { 3222 STATUS(float_exception_flags) |= float_flag_inexact; 3223 } 3224 return z; 3225 3226} 3227 3228/*---------------------------------------------------------------------------- 3229| Returns the result of converting the extended double-precision floating- 3230| point value `a' to the 64-bit two's complement integer format. The 3231| conversion is performed according to the IEC/IEEE Standard for Binary 3232| Floating-Point Arithmetic---which means in particular that the conversion 3233| is rounded according to the current rounding mode. If `a' is a NaN, 3234| the largest positive integer is returned. Otherwise, if the conversion 3235| overflows, the largest integer with the same sign as `a' is returned. 3236*----------------------------------------------------------------------------*/ 3237 3238int64 floatx80_to_int64( floatx80 a STATUS_PARAM ) 3239{ 3240 flag aSign; 3241 int32 aExp, shiftCount; 3242 bits64 aSig, aSigExtra; 3243 3244 aSig = extractFloatx80Frac( a ); 3245 aExp = extractFloatx80Exp( a ); 3246 aSign = extractFloatx80Sign( a ); 3247 shiftCount = 0x403E - aExp; 3248 if ( shiftCount <= 0 ) { 3249 if ( shiftCount ) { 3250 float_raise( float_flag_invalid STATUS_VAR); 3251 if ( ! aSign 3252 || ( ( aExp == 0x7FFF ) 3253 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3254 ) { 3255 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3256 } 3257 return (sbits64) LIT64( 0x8000000000000000 ); 3258 } 3259 aSigExtra = 0; 3260 } 3261 else { 3262 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3263 } 3264 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 3265 3266} 3267 3268/*---------------------------------------------------------------------------- 3269| Returns the result of converting the extended double-precision floating- 3270| point value `a' to the 64-bit two's complement integer format. The 3271| conversion is performed according to the IEC/IEEE Standard for Binary 3272| Floating-Point Arithmetic, except that the conversion is always rounded 3273| toward zero. If `a' is a NaN, the largest positive integer is returned. 3274| Otherwise, if the conversion overflows, the largest integer with the same 3275| sign as `a' is returned. 3276*----------------------------------------------------------------------------*/ 3277 3278int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM ) 3279{ 3280 flag aSign; 3281 int32 aExp, shiftCount; 3282 bits64 aSig; 3283 int64 z; 3284 3285 aSig = extractFloatx80Frac( a ); 3286 aExp = extractFloatx80Exp( a ); 3287 aSign = extractFloatx80Sign( a ); 3288 shiftCount = aExp - 0x403E; 3289 if ( 0 <= shiftCount ) { 3290 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3291 if ( ( a.high != 0xC03E ) || aSig ) { 3292 float_raise( float_flag_invalid STATUS_VAR); 3293 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3294 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3295 } 3296 } 3297 return (sbits64) LIT64( 0x8000000000000000 ); 3298 } 3299 else if ( aExp < 0x3FFF ) { 3300 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3301 return 0; 3302 } 3303 z = aSig>>( - shiftCount ); 3304 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3305 STATUS(float_exception_flags) |= float_flag_inexact; 3306 } 3307 if ( aSign ) z = - z; 3308 return z; 3309 3310} 3311 3312/*---------------------------------------------------------------------------- 3313| Returns the result of converting the extended double-precision floating- 3314| point value `a' to the single-precision floating-point format. The 3315| conversion is performed according to the IEC/IEEE Standard for Binary 3316| Floating-Point Arithmetic. 3317*----------------------------------------------------------------------------*/ 3318 3319float32 floatx80_to_float32( floatx80 a STATUS_PARAM ) 3320{ 3321 flag aSign; 3322 int32 aExp; 3323 bits64 aSig; 3324 3325 aSig = extractFloatx80Frac( a ); 3326 aExp = extractFloatx80Exp( a ); 3327 aSign = extractFloatx80Sign( a ); 3328 if ( aExp == 0x7FFF ) { 3329 if ( (bits64) ( aSig<<1 ) ) { 3330 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) ); 3331 } 3332 return packFloat32( aSign, 0xFF, 0 ); 3333 } 3334 shift64RightJamming( aSig, 33, &aSig ); 3335 if ( aExp || aSig ) aExp -= 0x3F81; 3336 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 3337 3338} 3339 3340/*---------------------------------------------------------------------------- 3341| Returns the result of converting the extended double-precision floating- 3342| point value `a' to the double-precision floating-point format. The 3343| conversion is performed according to the IEC/IEEE Standard for Binary 3344| Floating-Point Arithmetic. 3345*----------------------------------------------------------------------------*/ 3346 3347float64 floatx80_to_float64( floatx80 a STATUS_PARAM ) 3348{ 3349 flag aSign; 3350 int32 aExp; 3351 bits64 aSig, zSig; 3352 3353 aSig = extractFloatx80Frac( a ); 3354 aExp = extractFloatx80Exp( a ); 3355 aSign = extractFloatx80Sign( a ); 3356 if ( aExp == 0x7FFF ) { 3357 if ( (bits64) ( aSig<<1 ) ) { 3358 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) ); 3359 } 3360 return packFloat64( aSign, 0x7FF, 0 ); 3361 } 3362 shift64RightJamming( aSig, 1, &zSig ); 3363 if ( aExp || aSig ) aExp -= 0x3C01; 3364 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR ); 3365 3366} 3367 3368#ifdef FLOAT128 3369 3370/*---------------------------------------------------------------------------- 3371| Returns the result of converting the extended double-precision floating- 3372| point value `a' to the quadruple-precision floating-point format. The 3373| conversion is performed according to the IEC/IEEE Standard for Binary 3374| Floating-Point Arithmetic. 3375*----------------------------------------------------------------------------*/ 3376 3377float128 floatx80_to_float128( floatx80 a STATUS_PARAM ) 3378{ 3379 flag aSign; 3380 int16 aExp; 3381 bits64 aSig, zSig0, zSig1; 3382 3383 aSig = extractFloatx80Frac( a ); 3384 aExp = extractFloatx80Exp( a ); 3385 aSign = extractFloatx80Sign( a ); 3386 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3387 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) ); 3388 } 3389 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3390 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3391 3392} 3393 3394#endif 3395 3396/*---------------------------------------------------------------------------- 3397| Rounds the extended double-precision floating-point value `a' to an integer, 3398| and returns the result as an extended quadruple-precision floating-point 3399| value. The operation is performed according to the IEC/IEEE Standard for 3400| Binary Floating-Point Arithmetic. 3401*----------------------------------------------------------------------------*/ 3402 3403floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM ) 3404{ 3405 flag aSign; 3406 int32 aExp; 3407 bits64 lastBitMask, roundBitsMask; 3408 int8 roundingMode; 3409 floatx80 z; 3410 3411 aExp = extractFloatx80Exp( a ); 3412 if ( 0x403E <= aExp ) { 3413 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3414 return propagateFloatx80NaN( a, a STATUS_VAR ); 3415 } 3416 return a; 3417 } 3418 if ( aExp < 0x3FFF ) { 3419 if ( ( aExp == 0 ) 3420 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3421 return a; 3422 } 3423 STATUS(float_exception_flags) |= float_flag_inexact; 3424 aSign = extractFloatx80Sign( a ); 3425 switch ( STATUS(float_rounding_mode) ) { 3426 case float_round_nearest_even: 3427 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3428 ) { 3429 return 3430 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3431 } 3432 break; 3433 case float_round_down: 3434 return 3435 aSign ? 3436 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3437 : packFloatx80( 0, 0, 0 ); 3438 case float_round_up: 3439 return 3440 aSign ? packFloatx80( 1, 0, 0 ) 3441 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3442 } 3443 return packFloatx80( aSign, 0, 0 ); 3444 } 3445 lastBitMask = 1; 3446 lastBitMask <<= 0x403E - aExp; 3447 roundBitsMask = lastBitMask - 1; 3448 z = a; 3449 roundingMode = STATUS(float_rounding_mode); 3450 if ( roundingMode == float_round_nearest_even ) { 3451 z.low += lastBitMask>>1; 3452 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3453 } 3454 else if ( roundingMode != float_round_to_zero ) { 3455 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3456 z.low += roundBitsMask; 3457 } 3458 } 3459 z.low &= ~ roundBitsMask; 3460 if ( z.low == 0 ) { 3461 ++z.high; 3462 z.low = LIT64( 0x8000000000000000 ); 3463 } 3464 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact; 3465 return z; 3466 3467} 3468 3469/*---------------------------------------------------------------------------- 3470| Returns the result of adding the absolute values of the extended double- 3471| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3472| negated before being returned. `zSign' is ignored if the result is a NaN. 3473| The addition is performed according to the IEC/IEEE Standard for Binary 3474| Floating-Point Arithmetic. 3475*----------------------------------------------------------------------------*/ 3476 3477static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM) 3478{ 3479 int32 aExp, bExp, zExp; 3480 bits64 aSig, bSig, zSig0, zSig1; 3481 int32 expDiff; 3482 3483 aSig = extractFloatx80Frac( a ); 3484 aExp = extractFloatx80Exp( a ); 3485 bSig = extractFloatx80Frac( b ); 3486 bExp = extractFloatx80Exp( b ); 3487 expDiff = aExp - bExp; 3488 if ( 0 < expDiff ) { 3489 if ( aExp == 0x7FFF ) { 3490 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3491 return a; 3492 } 3493 if ( bExp == 0 ) --expDiff; 3494 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3495 zExp = aExp; 3496 } 3497 else if ( expDiff < 0 ) { 3498 if ( bExp == 0x7FFF ) { 3499 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3500 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3501 } 3502 if ( aExp == 0 ) ++expDiff; 3503 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3504 zExp = bExp; 3505 } 3506 else { 3507 if ( aExp == 0x7FFF ) { 3508 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3509 return propagateFloatx80NaN( a, b STATUS_VAR ); 3510 } 3511 return a; 3512 } 3513 zSig1 = 0; 3514 zSig0 = aSig + bSig; 3515 if ( aExp == 0 ) { 3516 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3517 goto roundAndPack; 3518 } 3519 zExp = aExp; 3520 goto shiftRight1; 3521 } 3522 zSig0 = aSig + bSig; 3523 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3524 shiftRight1: 3525 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3526 zSig0 |= LIT64( 0x8000000000000000 ); 3527 ++zExp; 3528 roundAndPack: 3529 return 3530 roundAndPackFloatx80( 3531 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 3532 3533} 3534 3535/*---------------------------------------------------------------------------- 3536| Returns the result of subtracting the absolute values of the extended 3537| double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3538| difference is negated before being returned. `zSign' is ignored if the 3539| result is a NaN. The subtraction is performed according to the IEC/IEEE 3540| Standard for Binary Floating-Point Arithmetic. 3541*----------------------------------------------------------------------------*/ 3542 3543static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM ) 3544{ 3545 int32 aExp, bExp, zExp; 3546 bits64 aSig, bSig, zSig0, zSig1; 3547 int32 expDiff; 3548 floatx80 z; 3549 3550 aSig = extractFloatx80Frac( a ); 3551 aExp = extractFloatx80Exp( a ); 3552 bSig = extractFloatx80Frac( b ); 3553 bExp = extractFloatx80Exp( b ); 3554 expDiff = aExp - bExp; 3555 if ( 0 < expDiff ) goto aExpBigger; 3556 if ( expDiff < 0 ) goto bExpBigger; 3557 if ( aExp == 0x7FFF ) { 3558 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3559 return propagateFloatx80NaN( a, b STATUS_VAR ); 3560 } 3561 float_raise( float_flag_invalid STATUS_VAR); 3562 z.low = floatx80_default_nan_low; 3563 z.high = floatx80_default_nan_high; 3564 return z; 3565 } 3566 if ( aExp == 0 ) { 3567 aExp = 1; 3568 bExp = 1; 3569 } 3570 zSig1 = 0; 3571 if ( bSig < aSig ) goto aBigger; 3572 if ( aSig < bSig ) goto bBigger; 3573 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 3574 bExpBigger: 3575 if ( bExp == 0x7FFF ) { 3576 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3577 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3578 } 3579 if ( aExp == 0 ) ++expDiff; 3580 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3581 bBigger: 3582 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3583 zExp = bExp; 3584 zSign ^= 1; 3585 goto normalizeRoundAndPack; 3586 aExpBigger: 3587 if ( aExp == 0x7FFF ) { 3588 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3589 return a; 3590 } 3591 if ( bExp == 0 ) --expDiff; 3592 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3593 aBigger: 3594 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3595 zExp = aExp; 3596 normalizeRoundAndPack: 3597 return 3598 normalizeRoundAndPackFloatx80( 3599 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 3600 3601} 3602 3603/*---------------------------------------------------------------------------- 3604| Returns the result of adding the extended double-precision floating-point 3605| values `a' and `b'. The operation is performed according to the IEC/IEEE 3606| Standard for Binary Floating-Point Arithmetic. 3607*----------------------------------------------------------------------------*/ 3608 3609floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM ) 3610{ 3611 flag aSign, bSign; 3612 3613 aSign = extractFloatx80Sign( a ); 3614 bSign = extractFloatx80Sign( b ); 3615 if ( aSign == bSign ) { 3616 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 3617 } 3618 else { 3619 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 3620 } 3621 3622} 3623 3624/*---------------------------------------------------------------------------- 3625| Returns the result of subtracting the extended double-precision floating- 3626| point values `a' and `b'. The operation is performed according to the 3627| IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3628*----------------------------------------------------------------------------*/ 3629 3630floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM ) 3631{ 3632 flag aSign, bSign; 3633 3634 aSign = extractFloatx80Sign( a ); 3635 bSign = extractFloatx80Sign( b ); 3636 if ( aSign == bSign ) { 3637 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 3638 } 3639 else { 3640 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 3641 } 3642 3643} 3644 3645/*---------------------------------------------------------------------------- 3646| Returns the result of multiplying the extended double-precision floating- 3647| point values `a' and `b'. The operation is performed according to the 3648| IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3649*----------------------------------------------------------------------------*/ 3650 3651floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM ) 3652{ 3653 flag aSign, bSign, zSign; 3654 int32 aExp, bExp, zExp; 3655 bits64 aSig, bSig, zSig0, zSig1; 3656 floatx80 z; 3657 3658 aSig = extractFloatx80Frac( a ); 3659 aExp = extractFloatx80Exp( a ); 3660 aSign = extractFloatx80Sign( a ); 3661 bSig = extractFloatx80Frac( b ); 3662 bExp = extractFloatx80Exp( b ); 3663 bSign = extractFloatx80Sign( b ); 3664 zSign = aSign ^ bSign; 3665 if ( aExp == 0x7FFF ) { 3666 if ( (bits64) ( aSig<<1 ) 3667 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3668 return propagateFloatx80NaN( a, b STATUS_VAR ); 3669 } 3670 if ( ( bExp | bSig ) == 0 ) goto invalid; 3671 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3672 } 3673 if ( bExp == 0x7FFF ) { 3674 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3675 if ( ( aExp | aSig ) == 0 ) { 3676 invalid: 3677 float_raise( float_flag_invalid STATUS_VAR); 3678 z.low = floatx80_default_nan_low; 3679 z.high = floatx80_default_nan_high; 3680 return z; 3681 } 3682 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3683 } 3684 if ( aExp == 0 ) { 3685 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3686 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3687 } 3688 if ( bExp == 0 ) { 3689 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3690 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3691 } 3692 zExp = aExp + bExp - 0x3FFE; 3693 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3694 if ( 0 < (sbits64) zSig0 ) { 3695 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3696 --zExp; 3697 } 3698 return 3699 roundAndPackFloatx80( 3700 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 3701 3702} 3703 3704/*---------------------------------------------------------------------------- 3705| Returns the result of dividing the extended double-precision floating-point 3706| value `a' by the corresponding value `b'. The operation is performed 3707| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3708*----------------------------------------------------------------------------*/ 3709 3710floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM ) 3711{ 3712 flag aSign, bSign, zSign; 3713 int32 aExp, bExp, zExp; 3714 bits64 aSig, bSig, zSig0, zSig1; 3715 bits64 rem0, rem1, rem2, term0, term1, term2; 3716 floatx80 z; 3717 3718 aSig = extractFloatx80Frac( a ); 3719 aExp = extractFloatx80Exp( a ); 3720 aSign = extractFloatx80Sign( a ); 3721 bSig = extractFloatx80Frac( b ); 3722 bExp = extractFloatx80Exp( b ); 3723 bSign = extractFloatx80Sign( b ); 3724 zSign = aSign ^ bSign; 3725 if ( aExp == 0x7FFF ) { 3726 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3727 if ( bExp == 0x7FFF ) { 3728 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3729 goto invalid; 3730 } 3731 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3732 } 3733 if ( bExp == 0x7FFF ) { 3734 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3735 return packFloatx80( zSign, 0, 0 ); 3736 } 3737 if ( bExp == 0 ) { 3738 if ( bSig == 0 ) { 3739 if ( ( aExp | aSig ) == 0 ) { 3740 invalid: 3741 float_raise( float_flag_invalid STATUS_VAR); 3742 z.low = floatx80_default_nan_low; 3743 z.high = floatx80_default_nan_high; 3744 return z; 3745 } 3746 float_raise( float_flag_divbyzero STATUS_VAR); 3747 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3748 } 3749 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3750 } 3751 if ( aExp == 0 ) { 3752 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3753 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3754 } 3755 zExp = aExp - bExp + 0x3FFE; 3756 rem1 = 0; 3757 if ( bSig <= aSig ) { 3758 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3759 ++zExp; 3760 } 3761 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3762 mul64To128( bSig, zSig0, &term0, &term1 ); 3763 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3764 while ( (sbits64) rem0 < 0 ) { 3765 --zSig0; 3766 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3767 } 3768 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3769 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3770 mul64To128( bSig, zSig1, &term1, &term2 ); 3771 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3772 while ( (sbits64) rem1 < 0 ) { 3773 --zSig1; 3774 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3775 } 3776 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3777 } 3778 return 3779 roundAndPackFloatx80( 3780 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 3781 3782} 3783 3784/*---------------------------------------------------------------------------- 3785| Returns the remainder of the extended double-precision floating-point value 3786| `a' with respect to the corresponding value `b'. The operation is performed 3787| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3788*----------------------------------------------------------------------------*/ 3789 3790floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM ) 3791{ 3792 flag aSign, bSign, zSign; 3793 int32 aExp, bExp, expDiff; 3794 bits64 aSig0, aSig1, bSig; 3795 bits64 q, term0, term1, alternateASig0, alternateASig1; 3796 floatx80 z; 3797 3798 aSig0 = extractFloatx80Frac( a ); 3799 aExp = extractFloatx80Exp( a ); 3800 aSign = extractFloatx80Sign( a ); 3801 bSig = extractFloatx80Frac( b ); 3802 bExp = extractFloatx80Exp( b ); 3803 bSign = extractFloatx80Sign( b ); 3804 if ( aExp == 0x7FFF ) { 3805 if ( (bits64) ( aSig0<<1 ) 3806 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3807 return propagateFloatx80NaN( a, b STATUS_VAR ); 3808 } 3809 goto invalid; 3810 } 3811 if ( bExp == 0x7FFF ) { 3812 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3813 return a; 3814 } 3815 if ( bExp == 0 ) { 3816 if ( bSig == 0 ) { 3817 invalid: 3818 float_raise( float_flag_invalid STATUS_VAR); 3819 z.low = floatx80_default_nan_low; 3820 z.high = floatx80_default_nan_high; 3821 return z; 3822 } 3823 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3824 } 3825 if ( aExp == 0 ) { 3826 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3827 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3828 } 3829 bSig |= LIT64( 0x8000000000000000 ); 3830 zSign = aSign; 3831 expDiff = aExp - bExp; 3832 aSig1 = 0; 3833 if ( expDiff < 0 ) { 3834 if ( expDiff < -1 ) return a; 3835 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3836 expDiff = 0; 3837 } 3838 q = ( bSig <= aSig0 ); 3839 if ( q ) aSig0 -= bSig; 3840 expDiff -= 64; 3841 while ( 0 < expDiff ) { 3842 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3843 q = ( 2 < q ) ? q - 2 : 0; 3844 mul64To128( bSig, q, &term0, &term1 ); 3845 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3846 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3847 expDiff -= 62; 3848 } 3849 expDiff += 64; 3850 if ( 0 < expDiff ) { 3851 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3852 q = ( 2 < q ) ? q - 2 : 0; 3853 q >>= 64 - expDiff; 3854 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3855 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3856 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3857 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3858 ++q; 3859 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3860 } 3861 } 3862 else { 3863 term1 = 0; 3864 term0 = bSig; 3865 } 3866 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3867 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3868 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3869 && ( q & 1 ) ) 3870 ) { 3871 aSig0 = alternateASig0; 3872 aSig1 = alternateASig1; 3873 zSign = ! zSign; 3874 } 3875 return 3876 normalizeRoundAndPackFloatx80( 3877 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR ); 3878 3879} 3880 3881/*---------------------------------------------------------------------------- 3882| Returns the square root of the extended double-precision floating-point 3883| value `a'. The operation is performed according to the IEC/IEEE Standard 3884| for Binary Floating-Point Arithmetic. 3885*----------------------------------------------------------------------------*/ 3886 3887floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM ) 3888{ 3889 flag aSign; 3890 int32 aExp, zExp; 3891 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 3892 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3893 floatx80 z; 3894 3895 aSig0 = extractFloatx80Frac( a ); 3896 aExp = extractFloatx80Exp( a ); 3897 aSign = extractFloatx80Sign( a ); 3898 if ( aExp == 0x7FFF ) { 3899 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR ); 3900 if ( ! aSign ) return a; 3901 goto invalid; 3902 } 3903 if ( aSign ) { 3904 if ( ( aExp | aSig0 ) == 0 ) return a; 3905 invalid: 3906 float_raise( float_flag_invalid STATUS_VAR); 3907 z.low = floatx80_default_nan_low; 3908 z.high = floatx80_default_nan_high; 3909 return z; 3910 } 3911 if ( aExp == 0 ) { 3912 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 3913 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3914 } 3915 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 3916 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 3917 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 3918 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 3919 doubleZSig0 = zSig0<<1; 3920 mul64To128( zSig0, zSig0, &term0, &term1 ); 3921 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 3922 while ( (sbits64) rem0 < 0 ) { 3923 --zSig0; 3924 doubleZSig0 -= 2; 3925 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 3926 } 3927 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 3928 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 3929 if ( zSig1 == 0 ) zSig1 = 1; 3930 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 3931 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3932 mul64To128( zSig1, zSig1, &term2, &term3 ); 3933 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 3934 while ( (sbits64) rem1 < 0 ) { 3935 --zSig1; 3936 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 3937 term3 |= 1; 3938 term2 |= doubleZSig0; 3939 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 3940 } 3941 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 3942 } 3943 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 3944 zSig0 |= doubleZSig0; 3945 return 3946 roundAndPackFloatx80( 3947 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR ); 3948 3949} 3950 3951/*---------------------------------------------------------------------------- 3952| Returns 1 if the extended double-precision floating-point value `a' is 3953| equal to the corresponding value `b', and 0 otherwise. The comparison is 3954| performed according to the IEC/IEEE Standard for Binary Floating-Point 3955| Arithmetic. 3956*----------------------------------------------------------------------------*/ 3957 3958int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM ) 3959{ 3960 3961 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3962 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3963 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3964 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3965 ) { 3966 if ( floatx80_is_signaling_nan( a ) 3967 || floatx80_is_signaling_nan( b ) ) { 3968 float_raise( float_flag_invalid STATUS_VAR); 3969 } 3970 return 0; 3971 } 3972 return 3973 ( a.low == b.low ) 3974 && ( ( a.high == b.high ) 3975 || ( ( a.low == 0 ) 3976 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 3977 ); 3978 3979} 3980 3981/*---------------------------------------------------------------------------- 3982| Returns 1 if the extended double-precision floating-point value `a' is 3983| less than or equal to the corresponding value `b', and 0 otherwise. The 3984| comparison is performed according to the IEC/IEEE Standard for Binary 3985| Floating-Point Arithmetic. 3986*----------------------------------------------------------------------------*/ 3987 3988int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM ) 3989{ 3990 flag aSign, bSign; 3991 3992 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3993 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3994 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3995 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3996 ) { 3997 float_raise( float_flag_invalid STATUS_VAR); 3998 return 0; 3999 } 4000 aSign = extractFloatx80Sign( a ); 4001 bSign = extractFloatx80Sign( b ); 4002 if ( aSign != bSign ) { 4003 return 4004 aSign 4005 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4006 == 0 ); 4007 } 4008 return 4009 aSign ? le128( b.high, b.low, a.high, a.low ) 4010 : le128( a.high, a.low, b.high, b.low ); 4011 4012} 4013 4014/*---------------------------------------------------------------------------- 4015| Returns 1 if the extended double-precision floating-point value `a' is 4016| less than the corresponding value `b', and 0 otherwise. The comparison 4017| is performed according to the IEC/IEEE Standard for Binary Floating-Point 4018| Arithmetic. 4019*----------------------------------------------------------------------------*/ 4020 4021int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM ) 4022{ 4023 flag aSign, bSign; 4024 4025 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4026 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4027 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4028 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4029 ) { 4030 float_raise( float_flag_invalid STATUS_VAR); 4031 return 0; 4032 } 4033 aSign = extractFloatx80Sign( a ); 4034 bSign = extractFloatx80Sign( b ); 4035 if ( aSign != bSign ) { 4036 return 4037 aSign 4038 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4039 != 0 ); 4040 } 4041 return 4042 aSign ? lt128( b.high, b.low, a.high, a.low ) 4043 : lt128( a.high, a.low, b.high, b.low ); 4044 4045} 4046 4047/*---------------------------------------------------------------------------- 4048| Returns 1 if the extended double-precision floating-point value `a' is equal 4049| to the corresponding value `b', and 0 otherwise. The invalid exception is 4050| raised if either operand is a NaN. Otherwise, the comparison is performed 4051| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4052*----------------------------------------------------------------------------*/ 4053 4054int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM ) 4055{ 4056 4057 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4058 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4059 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4060 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4061 ) { 4062 float_raise( float_flag_invalid STATUS_VAR); 4063 return 0; 4064 } 4065 return 4066 ( a.low == b.low ) 4067 && ( ( a.high == b.high ) 4068 || ( ( a.low == 0 ) 4069 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4070 ); 4071 4072} 4073 4074/*---------------------------------------------------------------------------- 4075| Returns 1 if the extended double-precision floating-point value `a' is less 4076| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4077| do not cause an exception. Otherwise, the comparison is performed according 4078| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4079*----------------------------------------------------------------------------*/ 4080 4081int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4082{ 4083 flag aSign, bSign; 4084 4085 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4086 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4087 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4088 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4089 ) { 4090 if ( floatx80_is_signaling_nan( a ) 4091 || floatx80_is_signaling_nan( b ) ) { 4092 float_raise( float_flag_invalid STATUS_VAR); 4093 } 4094 return 0; 4095 } 4096 aSign = extractFloatx80Sign( a ); 4097 bSign = extractFloatx80Sign( b ); 4098 if ( aSign != bSign ) { 4099 return 4100 aSign 4101 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4102 == 0 ); 4103 } 4104 return 4105 aSign ? le128( b.high, b.low, a.high, a.low ) 4106 : le128( a.high, a.low, b.high, b.low ); 4107 4108} 4109 4110/*---------------------------------------------------------------------------- 4111| Returns 1 if the extended double-precision floating-point value `a' is less 4112| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4113| an exception. Otherwise, the comparison is performed according to the 4114| IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4115*----------------------------------------------------------------------------*/ 4116 4117int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4118{ 4119 flag aSign, bSign; 4120 4121 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4122 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4123 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4124 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4125 ) { 4126 if ( floatx80_is_signaling_nan( a ) 4127 || floatx80_is_signaling_nan( b ) ) { 4128 float_raise( float_flag_invalid STATUS_VAR); 4129 } 4130 return 0; 4131 } 4132 aSign = extractFloatx80Sign( a ); 4133 bSign = extractFloatx80Sign( b ); 4134 if ( aSign != bSign ) { 4135 return 4136 aSign 4137 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4138 != 0 ); 4139 } 4140 return 4141 aSign ? lt128( b.high, b.low, a.high, a.low ) 4142 : lt128( a.high, a.low, b.high, b.low ); 4143 4144} 4145 4146#endif 4147 4148#ifdef FLOAT128 4149 4150/*---------------------------------------------------------------------------- 4151| Returns the result of converting the quadruple-precision floating-point 4152| value `a' to the 32-bit two's complement integer format. The conversion 4153| is performed according to the IEC/IEEE Standard for Binary Floating-Point 4154| Arithmetic---which means in particular that the conversion is rounded 4155| according to the current rounding mode. If `a' is a NaN, the largest 4156| positive integer is returned. Otherwise, if the conversion overflows, the 4157| largest integer with the same sign as `a' is returned. 4158*----------------------------------------------------------------------------*/ 4159 4160int32 float128_to_int32( float128 a STATUS_PARAM ) 4161{ 4162 flag aSign; 4163 int32 aExp, shiftCount; 4164 bits64 aSig0, aSig1; 4165 4166 aSig1 = extractFloat128Frac1( a ); 4167 aSig0 = extractFloat128Frac0( a ); 4168 aExp = extractFloat128Exp( a ); 4169 aSign = extractFloat128Sign( a ); 4170 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4171 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4172 aSig0 |= ( aSig1 != 0 ); 4173 shiftCount = 0x4028 - aExp; 4174 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4175 return roundAndPackInt32( aSign, aSig0 STATUS_VAR ); 4176 4177} 4178 4179/*---------------------------------------------------------------------------- 4180| Returns the result of converting the quadruple-precision floating-point 4181| value `a' to the 32-bit two's complement integer format. The conversion 4182| is performed according to the IEC/IEEE Standard for Binary Floating-Point 4183| Arithmetic, except that the conversion is always rounded toward zero. If 4184| `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4185| conversion overflows, the largest integer with the same sign as `a' is 4186| returned. 4187*----------------------------------------------------------------------------*/ 4188 4189int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM ) 4190{ 4191 flag aSign; 4192 int32 aExp, shiftCount; 4193 bits64 aSig0, aSig1, savedASig; 4194 int32 z; 4195 4196 aSig1 = extractFloat128Frac1( a ); 4197 aSig0 = extractFloat128Frac0( a ); 4198 aExp = extractFloat128Exp( a ); 4199 aSign = extractFloat128Sign( a ); 4200 aSig0 |= ( aSig1 != 0 ); 4201 if ( 0x401E < aExp ) { 4202 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4203 goto invalid; 4204 } 4205 else if ( aExp < 0x3FFF ) { 4206 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact; 4207 return 0; 4208 } 4209 aSig0 |= LIT64( 0x0001000000000000 ); 4210 shiftCount = 0x402F - aExp; 4211 savedASig = aSig0; 4212 aSig0 >>= shiftCount; 4213 z = aSig0; 4214 if ( aSign ) z = - z; 4215 if ( ( z < 0 ) ^ aSign ) { 4216 invalid: 4217 float_raise( float_flag_invalid STATUS_VAR); 4218 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4219 } 4220 if ( ( aSig0<<shiftCount ) != savedASig ) { 4221 STATUS(float_exception_flags) |= float_flag_inexact; 4222 } 4223 return z; 4224 4225} 4226 4227/*---------------------------------------------------------------------------- 4228| Returns the result of converting the quadruple-precision floating-point 4229| value `a' to the 64-bit two's complement integer format. The conversion 4230| is performed according to the IEC/IEEE Standard for Binary Floating-Point 4231| Arithmetic---which means in particular that the conversion is rounded 4232| according to the current rounding mode. If `a' is a NaN, the largest 4233| positive integer is returned. Otherwise, if the conversion overflows, the 4234| largest integer with the same sign as `a' is returned. 4235*----------------------------------------------------------------------------*/ 4236 4237int64 float128_to_int64( float128 a STATUS_PARAM ) 4238{ 4239 flag aSign; 4240 int32 aExp, shiftCount; 4241 bits64 aSig0, aSig1; 4242 4243 aSig1 = extractFloat128Frac1( a ); 4244 aSig0 = extractFloat128Frac0( a ); 4245 aExp = extractFloat128Exp( a ); 4246 aSign = extractFloat128Sign( a ); 4247 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4248 shiftCount = 0x402F - aExp; 4249 if ( shiftCount <= 0 ) { 4250 if ( 0x403E < aExp ) { 4251 float_raise( float_flag_invalid STATUS_VAR); 4252 if ( ! aSign 4253 || ( ( aExp == 0x7FFF ) 4254 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4255 ) 4256 ) { 4257 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4258 } 4259 return (sbits64) LIT64( 0x8000000000000000 ); 4260 } 4261 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4262 } 4263 else { 4264 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4265 } 4266 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR ); 4267 4268} 4269 4270/*---------------------------------------------------------------------------- 4271| Returns the result of converting the quadruple-precision floating-point 4272| value `a' to the 64-bit two's complement integer format. The conversion 4273| is performed according to the IEC/IEEE Standard for Binary Floating-Point 4274| Arithmetic, except that the conversion is always rounded toward zero. 4275| If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4276| the conversion overflows, the largest integer with the same sign as `a' is 4277| returned. 4278*----------------------------------------------------------------------------*/ 4279 4280int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM ) 4281{ 4282 flag aSign; 4283 int32 aExp, shiftCount; 4284 bits64 aSig0, aSig1; 4285 int64 z; 4286 4287 aSig1 = extractFloat128Frac1( a ); 4288 aSig0 = extractFloat128Frac0( a ); 4289 aExp = extractFloat128Exp( a ); 4290 aSign = extractFloat128Sign( a ); 4291 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4292 shiftCount = aExp - 0x402F; 4293 if ( 0 < shiftCount ) { 4294 if ( 0x403E <= aExp ) { 4295 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4296 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4297 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4298 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 4299 } 4300 else { 4301 float_raise( float_flag_invalid STATUS_VAR); 4302 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4303 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4304 } 4305 } 4306 return (sbits64) LIT64( 0x8000000000000000 ); 4307 } 4308 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4309 if ( (bits64) ( aSig1<<shiftCount ) ) { 4310 STATUS(float_exception_flags) |= float_flag_inexact; 4311 } 4312 } 4313 else { 4314 if ( aExp < 0x3FFF ) { 4315 if ( aExp | aSig0 | aSig1 ) { 4316 STATUS(float_exception_flags) |= float_flag_inexact; 4317 } 4318 return 0; 4319 } 4320 z = aSig0>>( - shiftCount ); 4321 if ( aSig1 4322 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4323 STATUS(float_exception_flags) |= float_flag_inexact; 4324 } 4325 } 4326 if ( aSign ) z = - z; 4327 return z; 4328 4329} 4330 4331/*---------------------------------------------------------------------------- 4332| Returns the result of converting the quadruple-precision floating-point 4333| value `a' to the single-precision floating-point format. The conversion 4334| is performed according to the IEC/IEEE Standard for Binary Floating-Point 4335| Arithmetic. 4336*----------------------------------------------------------------------------*/ 4337 4338float32 float128_to_float32( float128 a STATUS_PARAM ) 4339{ 4340 flag aSign; 4341 int32 aExp; 4342 bits64 aSig0, aSig1; 4343 bits32 zSig; 4344 4345 aSig1 = extractFloat128Frac1( a ); 4346 aSig0 = extractFloat128Frac0( a ); 4347 aExp = extractFloat128Exp( a ); 4348 aSign = extractFloat128Sign( a ); 4349 if ( aExp == 0x7FFF ) { 4350 if ( aSig0 | aSig1 ) { 4351 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) ); 4352 } 4353 return packFloat32( aSign, 0xFF, 0 ); 4354 } 4355 aSig0 |= ( aSig1 != 0 ); 4356 shift64RightJamming( aSig0, 18, &aSig0 ); 4357 zSig = aSig0; 4358 if ( aExp || zSig ) { 4359 zSig |= 0x40000000; 4360 aExp -= 0x3F81; 4361 } 4362 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 4363 4364} 4365 4366/*---------------------------------------------------------------------------- 4367| Returns the result of converting the quadruple-precision floating-point 4368| value `a' to the double-precision floating-point format. The conversion 4369| is performed according to the IEC/IEEE Standard for Binary Floating-Point 4370| Arithmetic. 4371*----------------------------------------------------------------------------*/ 4372 4373float64 float128_to_float64( float128 a STATUS_PARAM ) 4374{ 4375 flag aSign; 4376 int32 aExp; 4377 bits64 aSig0, aSig1; 4378 4379 aSig1 = extractFloat128Frac1( a ); 4380 aSig0 = extractFloat128Frac0( a ); 4381 aExp = extractFloat128Exp( a ); 4382 aSign = extractFloat128Sign( a ); 4383 if ( aExp == 0x7FFF ) { 4384 if ( aSig0 | aSig1 ) { 4385 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) ); 4386 } 4387 return packFloat64( aSign, 0x7FF, 0 ); 4388 } 4389 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4390 aSig0 |= ( aSig1 != 0 ); 4391 if ( aExp || aSig0 ) { 4392 aSig0 |= LIT64( 0x4000000000000000 ); 4393 aExp -= 0x3C01; 4394 } 4395 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR ); 4396 4397} 4398 4399#ifdef FLOATX80 4400 4401/*---------------------------------------------------------------------------- 4402| Returns the result of converting the quadruple-precision floating-point 4403| value `a' to the extended double-precision floating-point format. The 4404| conversion is performed according to the IEC/IEEE Standard for Binary 4405| Floating-Point Arithmetic. 4406*----------------------------------------------------------------------------*/ 4407 4408floatx80 float128_to_floatx80( float128 a STATUS_PARAM ) 4409{ 4410 flag aSign; 4411 int32 aExp; 4412 bits64 aSig0, aSig1; 4413 4414 aSig1 = extractFloat128Frac1( a ); 4415 aSig0 = extractFloat128Frac0( a ); 4416 aExp = extractFloat128Exp( a ); 4417 aSign = extractFloat128Sign( a ); 4418 if ( aExp == 0x7FFF ) { 4419 if ( aSig0 | aSig1 ) { 4420 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) ); 4421 } 4422 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4423 } 4424 if ( aExp == 0 ) { 4425 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4426 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4427 } 4428 else { 4429 aSig0 |= LIT64( 0x0001000000000000 ); 4430 } 4431 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4432 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR ); 4433 4434} 4435 4436#endif 4437 4438/*---------------------------------------------------------------------------- 4439| Rounds the quadruple-precision floating-point value `a' to an integer, and 4440| returns the result as a quadruple-precision floating-point value. The 4441| operation is performed according to the IEC/IEEE Standard for Binary 4442| Floating-Point Arithmetic. 4443*----------------------------------------------------------------------------*/ 4444 4445float128 float128_round_to_int( float128 a STATUS_PARAM ) 4446{ 4447 flag aSign; 4448 int32 aExp; 4449 bits64 lastBitMask, roundBitsMask; 4450 int8 roundingMode; 4451 float128 z; 4452 4453 aExp = extractFloat128Exp( a ); 4454 if ( 0x402F <= aExp ) { 4455 if ( 0x406F <= aExp ) { 4456 if ( ( aExp == 0x7FFF ) 4457 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4458 ) { 4459 return propagateFloat128NaN( a, a STATUS_VAR ); 4460 } 4461 return a; 4462 } 4463 lastBitMask = 1; 4464 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4465 roundBitsMask = lastBitMask - 1; 4466 z = a; 4467 roundingMode = STATUS(float_rounding_mode); 4468 if ( roundingMode == float_round_nearest_even ) { 4469 if ( lastBitMask ) { 4470 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4471 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4472 } 4473 else { 4474 if ( (sbits64) z.low < 0 ) { 4475 ++z.high; 4476 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4477 } 4478 } 4479 } 4480 else if ( roundingMode != float_round_to_zero ) { 4481 if ( extractFloat128Sign( z ) 4482 ^ ( roundingMode == float_round_up ) ) { 4483 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4484 } 4485 } 4486 z.low &= ~ roundBitsMask; 4487 } 4488 else { 4489 if ( aExp < 0x3FFF ) { 4490 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4491 STATUS(float_exception_flags) |= float_flag_inexact; 4492 aSign = extractFloat128Sign( a ); 4493 switch ( STATUS(float_rounding_mode) ) { 4494 case float_round_nearest_even: 4495 if ( ( aExp == 0x3FFE ) 4496 && ( extractFloat128Frac0( a ) 4497 | extractFloat128Frac1( a ) ) 4498 ) { 4499 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4500 } 4501 break; 4502 case float_round_down: 4503 return 4504 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4505 : packFloat128( 0, 0, 0, 0 ); 4506 case float_round_up: 4507 return 4508 aSign ? packFloat128( 1, 0, 0, 0 ) 4509 : packFloat128( 0, 0x3FFF, 0, 0 ); 4510 } 4511 return packFloat128( aSign, 0, 0, 0 ); 4512 } 4513 lastBitMask = 1; 4514 lastBitMask <<= 0x402F - aExp; 4515 roundBitsMask = lastBitMask - 1; 4516 z.low = 0; 4517 z.high = a.high; 4518 roundingMode = STATUS(float_rounding_mode); 4519 if ( roundingMode == float_round_nearest_even ) { 4520 z.high += lastBitMask>>1; 4521 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4522 z.high &= ~ lastBitMask; 4523 } 4524 } 4525 else if ( roundingMode != float_round_to_zero ) { 4526 if ( extractFloat128Sign( z ) 4527 ^ ( roundingMode == float_round_up ) ) { 4528 z.high |= ( a.low != 0 ); 4529 z.high += roundBitsMask; 4530 } 4531 } 4532 z.high &= ~ roundBitsMask; 4533 } 4534 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4535 STATUS(float_exception_flags) |= float_flag_inexact; 4536 } 4537 return z; 4538 4539} 4540 4541/*---------------------------------------------------------------------------- 4542| Returns the result of adding the absolute values of the quadruple-precision 4543| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4544| before being returned. `zSign' is ignored if the result is a NaN. 4545| The addition is performed according to the IEC/IEEE Standard for Binary 4546| Floating-Point Arithmetic. 4547*----------------------------------------------------------------------------*/ 4548 4549static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 4550{ 4551 int32 aExp, bExp, zExp; 4552 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4553 int32 expDiff; 4554 4555 aSig1 = extractFloat128Frac1( a ); 4556 aSig0 = extractFloat128Frac0( a ); 4557 aExp = extractFloat128Exp( a ); 4558 bSig1 = extractFloat128Frac1( b ); 4559 bSig0 = extractFloat128Frac0( b ); 4560 bExp = extractFloat128Exp( b ); 4561 expDiff = aExp - bExp; 4562 if ( 0 < expDiff ) { 4563 if ( aExp == 0x7FFF ) { 4564 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4565 return a; 4566 } 4567 if ( bExp == 0 ) { 4568 --expDiff; 4569 } 4570 else { 4571 bSig0 |= LIT64( 0x0001000000000000 ); 4572 } 4573 shift128ExtraRightJamming( 4574 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4575 zExp = aExp; 4576 } 4577 else if ( expDiff < 0 ) { 4578 if ( bExp == 0x7FFF ) { 4579 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4580 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4581 } 4582 if ( aExp == 0 ) { 4583 ++expDiff; 4584 } 4585 else { 4586 aSig0 |= LIT64( 0x0001000000000000 ); 4587 } 4588 shift128ExtraRightJamming( 4589 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4590 zExp = bExp; 4591 } 4592 else { 4593 if ( aExp == 0x7FFF ) { 4594 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4595 return propagateFloat128NaN( a, b STATUS_VAR ); 4596 } 4597 return a; 4598 } 4599 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4600 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4601 zSig2 = 0; 4602 zSig0 |= LIT64( 0x0002000000000000 ); 4603 zExp = aExp; 4604 goto shiftRight1; 4605 } 4606 aSig0 |= LIT64( 0x0001000000000000 ); 4607 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4608 --zExp; 4609 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4610 ++zExp; 4611 shiftRight1: 4612 shift128ExtraRightJamming( 4613 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4614 roundAndPack: 4615 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 4616 4617} 4618 4619/*---------------------------------------------------------------------------- 4620| Returns the result of subtracting the absolute values of the quadruple- 4621| precision floating-point values `a' and `b'. If `zSign' is 1, the 4622| difference is negated before being returned. `zSign' is ignored if the 4623| result is a NaN. The subtraction is performed according to the IEC/IEEE 4624| Standard for Binary Floating-Point Arithmetic. 4625*----------------------------------------------------------------------------*/ 4626 4627static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 4628{ 4629 int32 aExp, bExp, zExp; 4630 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4631 int32 expDiff; 4632 float128 z; 4633 4634 aSig1 = extractFloat128Frac1( a ); 4635 aSig0 = extractFloat128Frac0( a ); 4636 aExp = extractFloat128Exp( a ); 4637 bSig1 = extractFloat128Frac1( b ); 4638 bSig0 = extractFloat128Frac0( b ); 4639 bExp = extractFloat128Exp( b ); 4640 expDiff = aExp - bExp; 4641 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4642 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4643 if ( 0 < expDiff ) goto aExpBigger; 4644 if ( expDiff < 0 ) goto bExpBigger; 4645 if ( aExp == 0x7FFF ) { 4646 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4647 return propagateFloat128NaN( a, b STATUS_VAR ); 4648 } 4649 float_raise( float_flag_invalid STATUS_VAR); 4650 z.low = float128_default_nan_low; 4651 z.high = float128_default_nan_high; 4652 return z; 4653 } 4654 if ( aExp == 0 ) { 4655 aExp = 1; 4656 bExp = 1; 4657 } 4658 if ( bSig0 < aSig0 ) goto aBigger; 4659 if ( aSig0 < bSig0 ) goto bBigger; 4660 if ( bSig1 < aSig1 ) goto aBigger; 4661 if ( aSig1 < bSig1 ) goto bBigger; 4662 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 ); 4663 bExpBigger: 4664 if ( bExp == 0x7FFF ) { 4665 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4666 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4667 } 4668 if ( aExp == 0 ) { 4669 ++expDiff; 4670 } 4671 else { 4672 aSig0 |= LIT64( 0x4000000000000000 ); 4673 } 4674 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4675 bSig0 |= LIT64( 0x4000000000000000 ); 4676 bBigger: 4677 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4678 zExp = bExp; 4679 zSign ^= 1; 4680 goto normalizeRoundAndPack; 4681 aExpBigger: 4682 if ( aExp == 0x7FFF ) { 4683 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4684 return a; 4685 } 4686 if ( bExp == 0 ) { 4687 --expDiff; 4688 } 4689 else { 4690 bSig0 |= LIT64( 0x4000000000000000 ); 4691 } 4692 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4693 aSig0 |= LIT64( 0x4000000000000000 ); 4694 aBigger: 4695 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4696 zExp = aExp; 4697 normalizeRoundAndPack: 4698 --zExp; 4699 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR ); 4700 4701} 4702 4703/*---------------------------------------------------------------------------- 4704| Returns the result of adding the quadruple-precision floating-point values 4705| `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4706| for Binary Floating-Point Arithmetic. 4707*----------------------------------------------------------------------------*/ 4708 4709float128 float128_add( float128 a, float128 b STATUS_PARAM ) 4710{ 4711 flag aSign, bSign; 4712 4713 aSign = extractFloat128Sign( a ); 4714 bSign = extractFloat128Sign( b ); 4715 if ( aSign == bSign ) { 4716 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 4717 } 4718 else { 4719 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 4720 } 4721 4722} 4723 4724/*---------------------------------------------------------------------------- 4725| Returns the result of subtracting the quadruple-precision floating-point 4726| values `a' and `b'. The operation is performed according to the IEC/IEEE 4727| Standard for Binary Floating-Point Arithmetic. 4728*----------------------------------------------------------------------------*/ 4729 4730float128 float128_sub( float128 a, float128 b STATUS_PARAM ) 4731{ 4732 flag aSign, bSign; 4733 4734 aSign = extractFloat128Sign( a ); 4735 bSign = extractFloat128Sign( b ); 4736 if ( aSign == bSign ) { 4737 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 4738 } 4739 else { 4740 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 4741 } 4742 4743} 4744 4745/*---------------------------------------------------------------------------- 4746| Returns the result of multiplying the quadruple-precision floating-point 4747| values `a' and `b'. The operation is performed according to the IEC/IEEE 4748| Standard for Binary Floating-Point Arithmetic. 4749*----------------------------------------------------------------------------*/ 4750 4751float128 float128_mul( float128 a, float128 b STATUS_PARAM ) 4752{ 4753 flag aSign, bSign, zSign; 4754 int32 aExp, bExp, zExp; 4755 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4756 float128 z; 4757 4758 aSig1 = extractFloat128Frac1( a ); 4759 aSig0 = extractFloat128Frac0( a ); 4760 aExp = extractFloat128Exp( a ); 4761 aSign = extractFloat128Sign( a ); 4762 bSig1 = extractFloat128Frac1( b ); 4763 bSig0 = extractFloat128Frac0( b ); 4764 bExp = extractFloat128Exp( b ); 4765 bSign = extractFloat128Sign( b ); 4766 zSign = aSign ^ bSign; 4767 if ( aExp == 0x7FFF ) { 4768 if ( ( aSig0 | aSig1 ) 4769 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4770 return propagateFloat128NaN( a, b STATUS_VAR ); 4771 } 4772 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4773 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4774 } 4775 if ( bExp == 0x7FFF ) { 4776 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4777 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4778 invalid: 4779 float_raise( float_flag_invalid STATUS_VAR); 4780 z.low = float128_default_nan_low; 4781 z.high = float128_default_nan_high; 4782 return z; 4783 } 4784 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4785 } 4786 if ( aExp == 0 ) { 4787 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4788 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4789 } 4790 if ( bExp == 0 ) { 4791 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4792 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4793 } 4794 zExp = aExp + bExp - 0x4000; 4795 aSig0 |= LIT64( 0x0001000000000000 ); 4796 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 4797 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 4798 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4799 zSig2 |= ( zSig3 != 0 ); 4800 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 4801 shift128ExtraRightJamming( 4802 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4803 ++zExp; 4804 } 4805 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 4806 4807} 4808 4809/*---------------------------------------------------------------------------- 4810| Returns the result of dividing the quadruple-precision floating-point value 4811| `a' by the corresponding value `b'. The operation is performed according to 4812| the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4813*----------------------------------------------------------------------------*/ 4814 4815float128 float128_div( float128 a, float128 b STATUS_PARAM ) 4816{ 4817 flag aSign, bSign, zSign; 4818 int32 aExp, bExp, zExp; 4819 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4820 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4821 float128 z; 4822 4823 aSig1 = extractFloat128Frac1( a ); 4824 aSig0 = extractFloat128Frac0( a ); 4825 aExp = extractFloat128Exp( a ); 4826 aSign = extractFloat128Sign( a ); 4827 bSig1 = extractFloat128Frac1( b ); 4828 bSig0 = extractFloat128Frac0( b ); 4829 bExp = extractFloat128Exp( b ); 4830 bSign = extractFloat128Sign( b ); 4831 zSign = aSign ^ bSign; 4832 if ( aExp == 0x7FFF ) { 4833 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4834 if ( bExp == 0x7FFF ) { 4835 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4836 goto invalid; 4837 } 4838 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4839 } 4840 if ( bExp == 0x7FFF ) { 4841 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4842 return packFloat128( zSign, 0, 0, 0 ); 4843 } 4844 if ( bExp == 0 ) { 4845 if ( ( bSig0 | bSig1 ) == 0 ) { 4846 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4847 invalid: 4848 float_raise( float_flag_invalid STATUS_VAR); 4849 z.low = float128_default_nan_low; 4850 z.high = float128_default_nan_high; 4851 return z; 4852 } 4853 float_raise( float_flag_divbyzero STATUS_VAR); 4854 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4855 } 4856 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4857 } 4858 if ( aExp == 0 ) { 4859 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4860 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4861 } 4862 zExp = aExp - bExp + 0x3FFD; 4863 shortShift128Left( 4864 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 4865 shortShift128Left( 4866 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 4867 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 4868 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 4869 ++zExp; 4870 } 4871 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 4872 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 4873 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 4874 while ( (sbits64) rem0 < 0 ) { 4875 --zSig0; 4876 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 4877 } 4878 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 4879 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 4880 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 4881 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 4882 while ( (sbits64) rem1 < 0 ) { 4883 --zSig1; 4884 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 4885 } 4886 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4887 } 4888 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 4889 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 4890 4891} 4892 4893/*---------------------------------------------------------------------------- 4894| Returns the remainder of the quadruple-precision floating-point value `a' 4895| with respect to the corresponding value `b'. The operation is performed 4896| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4897*----------------------------------------------------------------------------*/ 4898 4899float128 float128_rem( float128 a, float128 b STATUS_PARAM ) 4900{ 4901 flag aSign, bSign, zSign; 4902 int32 aExp, bExp, expDiff; 4903 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 4904 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 4905 sbits64 sigMean0; 4906 float128 z; 4907 4908 aSig1 = extractFloat128Frac1( a ); 4909 aSig0 = extractFloat128Frac0( a ); 4910 aExp = extractFloat128Exp( a ); 4911 aSign = extractFloat128Sign( a ); 4912 bSig1 = extractFloat128Frac1( b ); 4913 bSig0 = extractFloat128Frac0( b ); 4914 bExp = extractFloat128Exp( b ); 4915 bSign = extractFloat128Sign( b ); 4916 if ( aExp == 0x7FFF ) { 4917 if ( ( aSig0 | aSig1 ) 4918 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4919 return propagateFloat128NaN( a, b STATUS_VAR ); 4920 } 4921 goto invalid; 4922 } 4923 if ( bExp == 0x7FFF ) { 4924 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4925 return a; 4926 } 4927 if ( bExp == 0 ) { 4928 if ( ( bSig0 | bSig1 ) == 0 ) { 4929 invalid: 4930 float_raise( float_flag_invalid STATUS_VAR); 4931 z.low = float128_default_nan_low; 4932 z.high = float128_default_nan_high; 4933 return z; 4934 } 4935 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4936 } 4937 if ( aExp == 0 ) { 4938 if ( ( aSig0 | aSig1 ) == 0 ) return a; 4939 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4940 } 4941 expDiff = aExp - bExp; 4942 if ( expDiff < -1 ) return a; 4943 shortShift128Left( 4944 aSig0 | LIT64( 0x0001000000000000 ), 4945 aSig1, 4946 15 - ( expDiff < 0 ), 4947 &aSig0, 4948 &aSig1 4949 ); 4950 shortShift128Left( 4951 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 4952 q = le128( bSig0, bSig1, aSig0, aSig1 ); 4953 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 4954 expDiff -= 64; 4955 while ( 0 < expDiff ) { 4956 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 4957 q = ( 4 < q ) ? q - 4 : 0; 4958 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 4959 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 4960 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 4961 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 4962 expDiff -= 61; 4963 } 4964 if ( -64 < expDiff ) { 4965 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 4966 q = ( 4 < q ) ? q - 4 : 0; 4967 q >>= - expDiff; 4968 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 4969 expDiff += 52; 4970 if ( expDiff < 0 ) { 4971 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4972 } 4973 else { 4974 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 4975 } 4976 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 4977 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 4978 } 4979 else { 4980 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 4981 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 4982 } 4983 do { 4984 alternateASig0 = aSig0; 4985 alternateASig1 = aSig1; 4986 ++q; 4987 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 4988 } while ( 0 <= (sbits64) aSig0 ); 4989 add128( 4990 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 4991 if ( ( sigMean0 < 0 ) 4992 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 4993 aSig0 = alternateASig0; 4994 aSig1 = alternateASig1; 4995 } 4996 zSign = ( (sbits64) aSig0 < 0 ); 4997 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 4998 return 4999 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR ); 5000 5001} 5002 5003/*---------------------------------------------------------------------------- 5004| Returns the square root of the quadruple-precision floating-point value `a'. 5005| The operation is performed according to the IEC/IEEE Standard for Binary 5006| Floating-Point Arithmetic. 5007*----------------------------------------------------------------------------*/ 5008 5009float128 float128_sqrt( float128 a STATUS_PARAM ) 5010{ 5011 flag aSign; 5012 int32 aExp, zExp; 5013 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5014 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5015 float128 z; 5016 5017 aSig1 = extractFloat128Frac1( a ); 5018 aSig0 = extractFloat128Frac0( a ); 5019 aExp = extractFloat128Exp( a ); 5020 aSign = extractFloat128Sign( a ); 5021 if ( aExp == 0x7FFF ) { 5022 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR ); 5023 if ( ! aSign ) return a; 5024 goto invalid; 5025 } 5026 if ( aSign ) { 5027 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5028 invalid: 5029 float_raise( float_flag_invalid STATUS_VAR); 5030 z.low = float128_default_nan_low; 5031 z.high = float128_default_nan_high; 5032 return z; 5033 } 5034 if ( aExp == 0 ) { 5035 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5036 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5037 } 5038 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5039 aSig0 |= LIT64( 0x0001000000000000 ); 5040 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5041 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5042 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5043 doubleZSig0 = zSig0<<1; 5044 mul64To128( zSig0, zSig0, &term0, &term1 ); 5045 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5046 while ( (sbits64) rem0 < 0 ) { 5047 --zSig0; 5048 doubleZSig0 -= 2; 5049 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5050 } 5051 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5052 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5053 if ( zSig1 == 0 ) zSig1 = 1; 5054 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5055 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5056 mul64To128( zSig1, zSig1, &term2, &term3 ); 5057 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5058 while ( (sbits64) rem1 < 0 ) { 5059 --zSig1; 5060 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5061 term3 |= 1; 5062 term2 |= doubleZSig0; 5063 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5064 } 5065 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5066 } 5067 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5068 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5069 5070} 5071 5072/*---------------------------------------------------------------------------- 5073| Returns 1 if the quadruple-precision floating-point value `a' is equal to 5074| the corresponding value `b', and 0 otherwise. The comparison is performed 5075| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5076*----------------------------------------------------------------------------*/ 5077 5078int float128_eq( float128 a, float128 b STATUS_PARAM ) 5079{ 5080 5081 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5082 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5083 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5084 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5085 ) { 5086 if ( float128_is_signaling_nan( a ) 5087 || float128_is_signaling_nan( b ) ) { 5088 float_raise( float_flag_invalid STATUS_VAR); 5089 } 5090 return 0; 5091 } 5092 return 5093 ( a.low == b.low ) 5094 && ( ( a.high == b.high ) 5095 || ( ( a.low == 0 ) 5096 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5097 ); 5098 5099} 5100 5101/*---------------------------------------------------------------------------- 5102| Returns 1 if the quadruple-precision floating-point value `a' is less than 5103| or equal to the corresponding value `b', and 0 otherwise. The comparison 5104| is performed according to the IEC/IEEE Standard for Binary Floating-Point 5105| Arithmetic. 5106*----------------------------------------------------------------------------*/ 5107 5108int float128_le( float128 a, float128 b STATUS_PARAM ) 5109{ 5110 flag aSign, bSign; 5111 5112 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5113 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5114 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5115 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5116 ) { 5117 float_raise( float_flag_invalid STATUS_VAR); 5118 return 0; 5119 } 5120 aSign = extractFloat128Sign( a ); 5121 bSign = extractFloat128Sign( b ); 5122 if ( aSign != bSign ) { 5123 return 5124 aSign 5125 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5126 == 0 ); 5127 } 5128 return 5129 aSign ? le128( b.high, b.low, a.high, a.low ) 5130 : le128( a.high, a.low, b.high, b.low ); 5131 5132} 5133 5134/*---------------------------------------------------------------------------- 5135| Returns 1 if the quadruple-precision floating-point value `a' is less than 5136| the corresponding value `b', and 0 otherwise. The comparison is performed 5137| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5138*----------------------------------------------------------------------------*/ 5139 5140int float128_lt( float128 a, float128 b STATUS_PARAM ) 5141{ 5142 flag aSign, bSign; 5143 5144 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5145 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5146 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5147 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5148 ) { 5149 float_raise( float_flag_invalid STATUS_VAR); 5150 return 0; 5151 } 5152 aSign = extractFloat128Sign( a ); 5153 bSign = extractFloat128Sign( b ); 5154 if ( aSign != bSign ) { 5155 return 5156 aSign 5157 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5158 != 0 ); 5159 } 5160 return 5161 aSign ? lt128( b.high, b.low, a.high, a.low ) 5162 : lt128( a.high, a.low, b.high, b.low ); 5163 5164} 5165 5166/*---------------------------------------------------------------------------- 5167| Returns 1 if the quadruple-precision floating-point value `a' is equal to 5168| the corresponding value `b', and 0 otherwise. The invalid exception is 5169| raised if either operand is a NaN. Otherwise, the comparison is performed 5170| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5171*----------------------------------------------------------------------------*/ 5172 5173int float128_eq_signaling( float128 a, float128 b STATUS_PARAM ) 5174{ 5175 5176 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5177 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5178 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5179 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5180 ) { 5181 float_raise( float_flag_invalid STATUS_VAR); 5182 return 0; 5183 } 5184 return 5185 ( a.low == b.low ) 5186 && ( ( a.high == b.high ) 5187 || ( ( a.low == 0 ) 5188 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5189 ); 5190 5191} 5192 5193/*---------------------------------------------------------------------------- 5194| Returns 1 if the quadruple-precision floating-point value `a' is less than 5195| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5196| cause an exception. Otherwise, the comparison is performed according to the 5197| IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5198*----------------------------------------------------------------------------*/ 5199 5200int float128_le_quiet( float128 a, float128 b STATUS_PARAM ) 5201{ 5202 flag aSign, bSign; 5203 5204 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5205 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5206 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5207 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5208 ) { 5209 if ( float128_is_signaling_nan( a ) 5210 || float128_is_signaling_nan( b ) ) { 5211 float_raise( float_flag_invalid STATUS_VAR); 5212 } 5213 return 0; 5214 } 5215 aSign = extractFloat128Sign( a ); 5216 bSign = extractFloat128Sign( b ); 5217 if ( aSign != bSign ) { 5218 return 5219 aSign 5220 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5221 == 0 ); 5222 } 5223 return 5224 aSign ? le128( b.high, b.low, a.high, a.low ) 5225 : le128( a.high, a.low, b.high, b.low ); 5226 5227} 5228 5229/*---------------------------------------------------------------------------- 5230| Returns 1 if the quadruple-precision floating-point value `a' is less than 5231| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5232| exception. Otherwise, the comparison is performed according to the IEC/IEEE 5233| Standard for Binary Floating-Point Arithmetic. 5234*----------------------------------------------------------------------------*/ 5235 5236int float128_lt_quiet( float128 a, float128 b STATUS_PARAM ) 5237{ 5238 flag aSign, bSign; 5239 5240 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5241 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5242 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5243 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5244 ) { 5245 if ( float128_is_signaling_nan( a ) 5246 || float128_is_signaling_nan( b ) ) { 5247 float_raise( float_flag_invalid STATUS_VAR); 5248 } 5249 return 0; 5250 } 5251 aSign = extractFloat128Sign( a ); 5252 bSign = extractFloat128Sign( b ); 5253 if ( aSign != bSign ) { 5254 return 5255 aSign 5256 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5257 != 0 ); 5258 } 5259 return 5260 aSign ? lt128( b.high, b.low, a.high, a.low ) 5261 : lt128( a.high, a.low, b.high, b.low ); 5262 5263} 5264 5265#endif 5266 5267/* misc functions */ 5268float32 uint32_to_float32( unsigned int a STATUS_PARAM ) 5269{ 5270 return int64_to_float32(a STATUS_VAR); 5271} 5272 5273float64 uint32_to_float64( unsigned int a STATUS_PARAM ) 5274{ 5275 return int64_to_float64(a STATUS_VAR); 5276} 5277 5278unsigned int float32_to_uint32( float32 a STATUS_PARAM ) 5279{ 5280 int64_t v; 5281 unsigned int res; 5282 5283 v = float32_to_int64(a STATUS_VAR); 5284 if (v < 0) { 5285 res = 0; 5286 float_raise( float_flag_invalid STATUS_VAR); 5287 } else if (v > 0xffffffff) { 5288 res = 0xffffffff; 5289 float_raise( float_flag_invalid STATUS_VAR); 5290 } else { 5291 res = v; 5292 } 5293 return res; 5294} 5295 5296unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM ) 5297{ 5298 int64_t v; 5299 unsigned int res; 5300 5301 v = float32_to_int64_round_to_zero(a STATUS_VAR); 5302 if (v < 0) { 5303 res = 0; 5304 float_raise( float_flag_invalid STATUS_VAR); 5305 } else if (v > 0xffffffff) { 5306 res = 0xffffffff; 5307 float_raise( float_flag_invalid STATUS_VAR); 5308 } else { 5309 res = v; 5310 } 5311 return res; 5312} 5313 5314unsigned int float64_to_uint32( float64 a STATUS_PARAM ) 5315{ 5316 int64_t v; 5317 unsigned int res; 5318 5319 v = float64_to_int64(a STATUS_VAR); 5320 if (v < 0) { 5321 res = 0; 5322 float_raise( float_flag_invalid STATUS_VAR); 5323 } else if (v > 0xffffffff) { 5324 res = 0xffffffff; 5325 float_raise( float_flag_invalid STATUS_VAR); 5326 } else { 5327 res = v; 5328 } 5329 return res; 5330} 5331 5332unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM ) 5333{ 5334 int64_t v; 5335 unsigned int res; 5336 5337 v = float64_to_int64_round_to_zero(a STATUS_VAR); 5338 if (v < 0) { 5339 res = 0; 5340 float_raise( float_flag_invalid STATUS_VAR); 5341 } else if (v > 0xffffffff) { 5342 res = 0xffffffff; 5343 float_raise( float_flag_invalid STATUS_VAR); 5344 } else { 5345 res = v; 5346 } 5347 return res; 5348} 5349 5350/* FIXME: This looks broken. */ 5351uint64_t float64_to_uint64 (float64 a STATUS_PARAM) 5352{ 5353 int64_t v; 5354 5355 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 5356 v += float64_val(a); 5357 v = float64_to_int64(make_float64(v) STATUS_VAR); 5358 5359 return v - INT64_MIN; 5360} 5361 5362uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM) 5363{ 5364 int64_t v; 5365 5366 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 5367 v += float64_val(a); 5368 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR); 5369 5370 return v - INT64_MIN; 5371} 5372 5373#define COMPARE(s, nan_exp) \ 5374INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \ 5375 int is_quiet STATUS_PARAM ) \ 5376{ \ 5377 flag aSign, bSign; \ 5378 bits ## s av, bv; \ 5379 \ 5380 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 5381 extractFloat ## s ## Frac( a ) ) || \ 5382 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 5383 extractFloat ## s ## Frac( b ) )) { \ 5384 if (!is_quiet || \ 5385 float ## s ## _is_signaling_nan( a ) || \ 5386 float ## s ## _is_signaling_nan( b ) ) { \ 5387 float_raise( float_flag_invalid STATUS_VAR); \ 5388 } \ 5389 return float_relation_unordered; \ 5390 } \ 5391 aSign = extractFloat ## s ## Sign( a ); \ 5392 bSign = extractFloat ## s ## Sign( b ); \ 5393 av = float ## s ## _val(a); \ 5394 bv = float ## s ## _val(b); \ 5395 if ( aSign != bSign ) { \ 5396 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \ 5397 /* zero case */ \ 5398 return float_relation_equal; \ 5399 } else { \ 5400 return 1 - (2 * aSign); \ 5401 } \ 5402 } else { \ 5403 if (av == bv) { \ 5404 return float_relation_equal; \ 5405 } else { \ 5406 return 1 - 2 * (aSign ^ ( av < bv )); \ 5407 } \ 5408 } \ 5409} \ 5410 \ 5411int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \ 5412{ \ 5413 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \ 5414} \ 5415 \ 5416int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \ 5417{ \ 5418 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \ 5419} 5420 5421COMPARE(32, 0xff) 5422COMPARE(64, 0x7ff) 5423 5424INLINE int float128_compare_internal( float128 a, float128 b, 5425 int is_quiet STATUS_PARAM ) 5426{ 5427 flag aSign, bSign; 5428 5429 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 5430 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 5431 ( ( extractFloat128Exp( b ) == 0x7fff ) && 5432 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 5433 if (!is_quiet || 5434 float128_is_signaling_nan( a ) || 5435 float128_is_signaling_nan( b ) ) { 5436 float_raise( float_flag_invalid STATUS_VAR); 5437 } 5438 return float_relation_unordered; 5439 } 5440 aSign = extractFloat128Sign( a ); 5441 bSign = extractFloat128Sign( b ); 5442 if ( aSign != bSign ) { 5443 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 5444 /* zero case */ 5445 return float_relation_equal; 5446 } else { 5447 return 1 - (2 * aSign); 5448 } 5449 } else { 5450 if (a.low == b.low && a.high == b.high) { 5451 return float_relation_equal; 5452 } else { 5453 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 5454 } 5455 } 5456} 5457 5458int float128_compare( float128 a, float128 b STATUS_PARAM ) 5459{ 5460 return float128_compare_internal(a, b, 0 STATUS_VAR); 5461} 5462 5463int float128_compare_quiet( float128 a, float128 b STATUS_PARAM ) 5464{ 5465 return float128_compare_internal(a, b, 1 STATUS_VAR); 5466} 5467 5468/* Multiply A by 2 raised to the power N. */ 5469float32 float32_scalbn( float32 a, int n STATUS_PARAM ) 5470{ 5471 flag aSign; 5472 int16 aExp; 5473 bits32 aSig; 5474 5475 aSig = extractFloat32Frac( a ); 5476 aExp = extractFloat32Exp( a ); 5477 aSign = extractFloat32Sign( a ); 5478 5479 if ( aExp == 0xFF ) { 5480 return a; 5481 } 5482 aExp += n; 5483 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 5484} 5485 5486float64 float64_scalbn( float64 a, int n STATUS_PARAM ) 5487{ 5488 flag aSign; 5489 int16 aExp; 5490 bits64 aSig; 5491 5492 aSig = extractFloat64Frac( a ); 5493 aExp = extractFloat64Exp( a ); 5494 aSign = extractFloat64Sign( a ); 5495 5496 if ( aExp == 0x7FF ) { 5497 return a; 5498 } 5499 aExp += n; 5500 return roundAndPackFloat64( aSign, aExp, aSig STATUS_VAR ); 5501} 5502 5503#ifdef FLOATX80 5504floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM ) 5505{ 5506 flag aSign; 5507 int16 aExp; 5508 bits64 aSig; 5509 5510 aSig = extractFloatx80Frac( a ); 5511 aExp = extractFloatx80Exp( a ); 5512 aSign = extractFloatx80Sign( a ); 5513 5514 if ( aExp == 0x7FF ) { 5515 return a; 5516 } 5517 aExp += n; 5518 return roundAndPackFloatx80( STATUS(floatx80_rounding_precision), 5519 aSign, aExp, aSig, 0 STATUS_VAR ); 5520} 5521#endif 5522 5523#ifdef FLOAT128 5524float128 float128_scalbn( float128 a, int n STATUS_PARAM ) 5525{ 5526 flag aSign; 5527 int32 aExp; 5528 bits64 aSig0, aSig1; 5529 5530 aSig1 = extractFloat128Frac1( a ); 5531 aSig0 = extractFloat128Frac0( a ); 5532 aExp = extractFloat128Exp( a ); 5533 aSign = extractFloat128Sign( a ); 5534 if ( aExp == 0x7FFF ) { 5535 return a; 5536 } 5537 aExp += n; 5538 return roundAndPackFloat128( aSign, aExp, aSig0, aSig1, 0 STATUS_VAR ); 5539 5540} 5541#endif 5542