uvmac/src/FPMATHEM.h

6265 lines
184 KiB
C

/*
FPMATHEM.h
Copyright (C) 2007 John R. Hauser, Stanislav Shwartsman,
Ross Martin, Paul C. Pratt
You can redistribute this file and/or modify it under the terms
of version 2 of the GNU General Public License as published by
the Free Software Foundation. You should have received a copy
of the license along with this file; see the file COPYING.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
license for more details.
*/
/*
Floating Point MATH implemented with software EMulation
Based on the SoftFloat IEC/IEEE Floating-point Arithmetic
Package, Release 2b, written by John R. Hauser.
Also uses extensions to SoftFloat, written for
Bochs (x86 achitecture simulator), by Stanislav Shwartsman.
*/
/*
original SoftFloat Copyright notice:
Written by John R. Hauser. This work was made possible in part by the
International Computer Science Institute, located at Suite 600, 1947 Center
Street, Berkeley, California 94704. Funding was partially provided by the
National Science Foundation under grant MIP-9311980. The original version
of this code was written as part of a project to build a fixed-point vector
processor in collaboration with the University of California at Berkeley,
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
arithmetic/SoftFloat.html'.
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
Derivative works are acceptable, even for commercial purposes, so long as
(1) the source code for the derivative work includes prominent notice that
the work is derivative, and (2) the source code includes prominent notice with
these four paragraphs for those parts of this code that are retained.
*/
/*
original Stanislav Shwartsman Copyright notice:
This source file is an extension to the SoftFloat IEC/IEEE Floating-point
Arithmetic Package, Release 2b, written for Bochs (x86 achitecture simulator)
floating point emulation.
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
Derivative works are acceptable, even for commercial purposes, so long as
(1) the source code for the derivative work includes prominent notice that
the work is derivative, and (2) the source code includes prominent notice with
these four paragraphs for those parts of this code that are retained.
.*============================================================================
* Written for Bochs (x86 achitecture simulator) by
* Stanislav Shwartsman [sshwarts at sourceforge net]
* ==========================================================================*.
*/
#include "SYSDEPNS.h"
/*
ReportAbnormalID unused 0x0204 - 0x02FF
*/
/* soft float stuff */
/*
should avoid 64 bit literals.
*/
typedef uint8_t flag; /* 0/1 */
/*
To fix: softfloat representation of
denormalized extended precision numbers
is different than on 68881.
*/
#define cIncludeFPUUnused cIncludeUnused
/* ----- from original file "softfloat.h" ----- */
/*======================================================================
This C header file is part of the SoftFloat IEC/IEEE Floating-point
Arithmetic Package, Release 2b.
["original SoftFloat Copyright notice" went here, included near top of
this file.]
======================================================================*/
/*----------------------------------------------------------------------------
| The macro `FLOATX80' must be defined to enable the extended double-precision
| floating-point format `floatx80'. If this macro is not defined, the
| `floatx80' type will not be defined, and none of the functions that either
| input or output the `floatx80' type will be defined. The same applies to
| the `FLOAT128' macro and the quadruple-precision format `float128'.
*----------------------------------------------------------------------------*/
#define FLOATX80
#define FLOAT128
/*----------------------------------------------------------------------------
| Software IEC/IEEE floating-point types.
*----------------------------------------------------------------------------*/
typedef struct {
uint64_t low;
unsigned short high;
} floatx80;
#ifdef FLOAT128
typedef struct {
uint64_t low, high;
} float128;
#endif
/* ----- end from original file "softfloat.h" ----- */
/*----------------------------------------------------------------------------
| Software IEC/IEEE floating-point rounding mode.
*----------------------------------------------------------------------------*/
enum {
float_round_nearest_even = 0,
float_round_down = 1,
float_round_up = 2,
float_round_to_zero = 3
};
/*----------------------------------------------------------------------------
| Floating-point rounding mode, extended double-precision rounding precision,
| and exception flags.
*----------------------------------------------------------------------------*/
LOCALVAR int8_t float_rounding_mode = float_round_nearest_even;
/*----------------------------------------------------------------------------
| Software IEC/IEEE floating-point exception flags.
*----------------------------------------------------------------------------*/
enum {
float_flag_invalid = 1,
float_flag_divbyzero = 4,
float_flag_overflow = 8,
float_flag_underflow = 16,
float_flag_inexact = 32
};
LOCALVAR int8_t float_exception_flags = 0;
/*----------------------------------------------------------------------------
| Software IEC/IEEE extended double-precision rounding precision. Valid
| values are 32, 64, and 80.
*----------------------------------------------------------------------------*/
LOCALVAR int8_t floatx80_rounding_precision = 80;
/*----------------------------------------------------------------------------
| Primitive arithmetic functions, including multi-word arithmetic, and
| division and square root approximations. (Can be specialized to target if
| desired.)
*----------------------------------------------------------------------------*/
/* ----- from original file "softfloat-macros" ----- */
/*============================================================================
This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
Arithmetic Package, Release 2b.
["original SoftFloat Copyright notice" went here, included near top of this file.]
=============================================================================*/
/*----------------------------------------------------------------------------
| Shifts `a' right by the number of bits given in `count'. If any nonzero
| bits are shifted off, they are ``jammed'' into the least significant bit of
| the result by setting the least significant bit to 1. The value of `count'
| can be arbitrarily large; in particular, if `count' is greater than 32, the
| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
| The result is stored in the location pointed to by `zPtr'.
*----------------------------------------------------------------------------*/
LOCALINLINEPROC shift32RightJamming( uint32_t a, int16_t count, uint32_t *zPtr )
{
uint32_t z;
if ( count == 0 ) {
z = a;
}
else if ( count < 32 ) {
z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
}
else {
z = ( a != 0 );
}
*zPtr = z;
}
/*----------------------------------------------------------------------------
| Shifts `a' right by the number of bits given in `count'. If any nonzero
| bits are shifted off, they are ``jammed'' into the least significant bit of
| the result by setting the least significant bit to 1. The value of `count'
| can be arbitrarily large; in particular, if `count' is greater than 64, the
| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
| The result is stored in the location pointed to by `zPtr'.
*----------------------------------------------------------------------------*/
LOCALINLINEPROC shift64RightJamming( uint64_t a, int16_t count, uint64_t *zPtr )
{
uint64_t z;
if ( count == 0 ) {
z = a;
}
else if ( count < 64 ) {
z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
}
else {
z = ( a != 0 );
}
*zPtr = z;
}
/*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
| _plus_ the number of bits given in `count'. The shifted result is at most
| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
| bits shifted off form a second 64-bit result as follows: The _last_ bit
| shifted off is the most-significant bit of the extra result, and the other
| 63 bits of the extra result are all zero if and only if _all_but_the_last_
| bits shifted off were all zero. This extra result is stored in the location
| pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
| (This routine makes more sense if `a0' and `a1' are considered to form
| a fixed-point value with binary point between `a0' and `a1'. This fixed-
| point value is shifted right by the number of bits given in `count', and
| the integer part of the result is returned at the location pointed to by
| `z0Ptr'. The fractional part of the result may be slightly corrupted as
| described above, and is returned at the location pointed to by `z1Ptr'.)
*----------------------------------------------------------------------------*/
LOCALINLINEPROC shift64ExtraRightJamming(
uint64_t a0, uint64_t a1, int16_t count, uint64_t *z0Ptr, uint64_t *z1Ptr )
{
uint64_t z0, z1;
int8_t negCount = ( - count ) & 63;
if ( count == 0 ) {
z1 = a1;
z0 = a0;
}
else if ( count < 64 ) {
z1 = ( a0<<negCount ) | ( a1 != 0 );
z0 = a0>>count;
}
else {
if ( count == 64 ) {
z1 = a0 | ( a1 != 0 );
}
else {
z1 = ( ( a0 | a1 ) != 0 );
}
z0 = 0;
}
*z1Ptr = z1;
*z0Ptr = z0;
}
/*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
| number of bits given in `count'. Any bits shifted off are lost. The value
| of `count' can be arbitrarily large; in particular, if `count' is greater
| than 128, the result will be 0. The result is broken into two 64-bit pieces
| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/
LOCALINLINEPROC shift128Right(
uint64_t a0, uint64_t a1, int16_t count, uint64_t *z0Ptr, uint64_t *z1Ptr )
{
uint64_t z0, z1;
int8_t negCount = ( - count ) & 63;
if ( count == 0 ) {
z1 = a1;
z0 = a0;
}
else if ( count < 64 ) {
z1 = ( a0<<negCount ) | ( a1>>count );
z0 = a0>>count;
}
else {
z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0;
z0 = 0;
}
*z1Ptr = z1;
*z0Ptr = z0;
}
/*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
| number of bits given in `count'. If any nonzero bits are shifted off, they
| are ``jammed'' into the least significant bit of the result by setting the
| least significant bit to 1. The value of `count' can be arbitrarily large;
| in particular, if `count' is greater than 128, the result will be either
| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
| nonzero. The result is broken into two 64-bit pieces which are stored at
| the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/
LOCALINLINEPROC shift128RightJamming(
uint64_t a0, uint64_t a1, int16_t count, uint64_t *z0Ptr, uint64_t *z1Ptr )
{
uint64_t z0, z1;
int8_t negCount = ( - count ) & 63;
if ( count == 0 ) {
z1 = a1;
z0 = a0;
}
else if ( count < 64 ) {
z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
z0 = a0>>count;
}
else {
if ( count == 64 ) {
z1 = a0 | ( a1 != 0 );
}
else if ( count < 128 ) {
z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
}
else {
z1 = ( ( a0 | a1 ) != 0 );
}
z0 = 0;
}
*z1Ptr = z1;
*z0Ptr = z0;
}
/*----------------------------------------------------------------------------
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
| by 64 _plus_ the number of bits given in `count'. The shifted result is
| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
| stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
| off form a third 64-bit result as follows: The _last_ bit shifted off is
| the most-significant bit of the extra result, and the other 63 bits of the
| extra result are all zero if and only if _all_but_the_last_ bits shifted off
| were all zero. This extra result is stored in the location pointed to by
| `z2Ptr'. The value of `count' can be arbitrarily large.
| (This routine makes more sense if `a0', `a1', and `a2' are considered
| to form a fixed-point value with binary point between `a1' and `a2'. This
| fixed-point value is shifted right by the number of bits given in `count',
| and the integer part of the result is returned at the locations pointed to
| by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
| corrupted as described above, and is returned at the location pointed to by
| `z2Ptr'.)
*----------------------------------------------------------------------------*/
LOCALINLINEPROC shift128ExtraRightJamming(
uint64_t a0,
uint64_t a1,
uint64_t a2,
int16_t count,
uint64_t *z0Ptr,
uint64_t *z1Ptr,
uint64_t *z2Ptr)
{
uint64_t z0, z1, z2;
int8_t negCount = ( - count ) & 63;
if ( count == 0 ) {
z2 = a2;
z1 = a1;
z0 = a0;
}
else {
if ( count < 64 ) {
z2 = a1<<negCount;
z1 = ( a0<<negCount ) | ( a1>>count );
z0 = a0>>count;
}
else {
if ( count == 64 ) {
z2 = a1;
z1 = a0;
}
else {
a2 |= a1;
if ( count < 128 ) {
z2 = a0<<negCount;
z1 = a0>>( count & 63 );
}
else {
z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
z1 = 0;
}
}
z0 = 0;
}
z2 |= ( a2 != 0 );
}
*z2Ptr = z2;
*z1Ptr = z1;
*z0Ptr = z0;
}
/*----------------------------------------------------------------------------
| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
| number of bits given in `count'. Any bits shifted off are lost. The value
| of `count' must be less than 64. The result is broken into two 64-bit
| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/
LOCALINLINEPROC shortShift128Left(
uint64_t a0, uint64_t a1, int16_t count, uint64_t *z0Ptr, uint64_t *z1Ptr )
{
*z1Ptr = a1<<count;
*z0Ptr =
( count == 0 ) ? a0 : ( a0<<count ) | ( a1>>( ( - count ) & 63 ) );
}
/*----------------------------------------------------------------------------
| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
| any carry out is lost. The result is broken into two 64-bit pieces which
| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/
LOCALINLINEPROC add128(
uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
{
uint64_t z1;
z1 = a1 + b1;
*z1Ptr = z1;
*z0Ptr = a0 + b0 + ( z1 < a1 );
}
/*----------------------------------------------------------------------------
| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
| 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
| modulo 2^192, so any carry out is lost. The result is broken into three
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
| `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/
LOCALINLINEPROC add192(
uint64_t a0,
uint64_t a1,
uint64_t a2,
uint64_t b0,
uint64_t b1,
uint64_t b2,
uint64_t *z0Ptr,
uint64_t *z1Ptr,
uint64_t *z2Ptr)
{
uint64_t z0, z1, z2;
int8_t carry0, carry1;
z2 = a2 + b2;
carry1 = ( z2 < a2 );
z1 = a1 + b1;
carry0 = ( z1 < a1 );
z0 = a0 + b0;
z1 += carry1;
z0 += ( z1 < carry1 );
z0 += carry0;
*z2Ptr = z2;
*z1Ptr = z1;
*z0Ptr = z0;
}
/*----------------------------------------------------------------------------
| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
| 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
| 2^128, so any borrow out (carry out) is lost. The result is broken into two
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
| `z1Ptr'.
*----------------------------------------------------------------------------*/
LOCALINLINEPROC
sub128(
uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
{
*z1Ptr = a1 - b1;
*z0Ptr = a0 - b0 - ( a1 < b1 );
}
/*----------------------------------------------------------------------------
| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
| Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
| result is broken into three 64-bit pieces which are stored at the locations
| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/
LOCALINLINEPROC
sub192(
uint64_t a0,
uint64_t a1,
uint64_t a2,
uint64_t b0,
uint64_t b1,
uint64_t b2,
uint64_t *z0Ptr,
uint64_t *z1Ptr,
uint64_t *z2Ptr
)
{
uint64_t z0, z1, z2;
int8_t borrow0, borrow1;
z2 = a2 - b2;
borrow1 = ( a2 < b2 );
z1 = a1 - b1;
borrow0 = ( a1 < b1 );
z0 = a0 - b0;
z0 -= ( z1 < borrow1 );
z1 -= borrow1;
z0 -= borrow0;
*z2Ptr = z2;
*z1Ptr = z1;
*z0Ptr = z0;
}
/*----------------------------------------------------------------------------
| Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
| into two 64-bit pieces which are stored at the locations pointed to by
| `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/
#ifndef HaveUi5to6Mul
#define HaveUi5to6Mul 1
#endif
#if HaveUi5to6Mul
LOCALINLINEPROC Ui5to6Mul( uint32_t src1, uint32_t src2, uint64_t *z)
{
*z = ((uint64_t) src1) * src2;
}
#else
LOCALINLINEPROC Ui6fromHiLo(uint32_t hi, uint32_t lo, uint64_t *z)
{
*z = (((uint64_t)(hi)) << 32) + lo;
#if 0
z->lo = hi;
z->hi = lo;
#endif
}
LOCALPROC Ui5to6Mul( uint32_t src1, uint32_t src2, uint64_t *z)
{
uint16_t src1_lo = uint32_t_lo(src1);
uint16_t src2_lo = uint32_t_lo(src2);
uint16_t src1_hi = uint32_t_hi(src1);
uint16_t src2_hi = uint32_t_hi(src2);
uint32_t r0 = ( (uint64_t) src1_lo ) * src2_lo;
uint32_t r1 = ( (uint64_t) src1_hi ) * src2_lo;
uint32_t r2 = ( (uint64_t) src1_lo ) * src2_hi;
uint32_t r3 = ( (uint64_t) src1_hi ) * src2_hi;
uint32_t ra1 = uint32_t_hi(r0) + uint32_t_lo(r1) + uint32_t_lo(r2);
uint32_t lo = (uint32_t_lo(ra1) << 16) | uint32_t_lo(r0);
uint32_t hi = uint32_t_hi(ra1) + uint32_t_hi(r1) + uint32_t_hi(r2) + r3;
Ui6fromHiLo(hi, lo, z);
}
#endif
LOCALINLINEPROC mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr )
{
uint32_t aHigh, aLow, bHigh, bLow;
uint64_t z0, zMiddleA, zMiddleB, z1;
aLow = a;
aHigh = a>>32;
bLow = b;
bHigh = b>>32;
Ui5to6Mul(aLow, bLow, &z1);
Ui5to6Mul(aLow, bHigh, &zMiddleA);
Ui5to6Mul(aHigh, bLow, &zMiddleB);
Ui5to6Mul(aHigh, bHigh, &z0);
zMiddleA += zMiddleB;
z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
zMiddleA <<= 32;
z1 += zMiddleA;
z0 += ( z1 < zMiddleA );
*z1Ptr = z1;
*z0Ptr = z0;
}
/*----------------------------------------------------------------------------
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
| `b' to obtain a 192-bit product. The product is broken into three 64-bit
| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
| `z2Ptr'.
*----------------------------------------------------------------------------*/
LOCALINLINEPROC
mul128By64To192(
uint64_t a0,
uint64_t a1,
uint64_t b,
uint64_t *z0Ptr,
uint64_t *z1Ptr,
uint64_t *z2Ptr
)
{
uint64_t z0, z1, z2, more1;
mul64To128( a1, b, &z1, &z2 );
mul64To128( a0, b, &z0, &more1 );
add128( z0, more1, 0, z1, &z0, &z1 );
*z2Ptr = z2;
*z1Ptr = z1;
*z0Ptr = z0;
}
/*----------------------------------------------------------------------------
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
| product. The product is broken into four 64-bit pieces which are stored at
| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
*----------------------------------------------------------------------------*/
LOCALINLINEPROC
mul128To256(
uint64_t a0,
uint64_t a1,
uint64_t b0,
uint64_t b1,
uint64_t *z0Ptr,
uint64_t *z1Ptr,
uint64_t *z2Ptr,
uint64_t *z3Ptr
)
{
uint64_t z0, z1, z2, z3;
uint64_t more1, more2;
mul64To128( a1, b1, &z2, &z3 );
mul64To128( a1, b0, &z1, &more2 );
add128( z1, more2, 0, z2, &z1, &z2 );
mul64To128( a0, b0, &z0, &more1 );
add128( z0, more1, 0, z1, &z0, &z1 );
mul64To128( a0, b1, &more1, &more2 );
add128( more1, more2, 0, z2, &more1, &z2 );
add128( z0, z1, 0, more1, &z0, &z1 );
*z3Ptr = z3;
*z2Ptr = z2;
*z1Ptr = z1;
*z0Ptr = z0;
}
/*----------------------------------------------------------------------------
| Returns an approximation to the 64-bit integer quotient obtained by dividing
| `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
| divisor `b' must be at least 2^63. If q is the exact quotient truncated
| toward zero, the approximation returned lies between q and q + 2 inclusive.
| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
| unsigned integer is returned.
*----------------------------------------------------------------------------*/
#ifndef HaveUi6Div
#define HaveUi6Div 0
#endif
#if HaveUi6Div
#define Ui6Div(x, y) ((x) / (y))
#else
/*
Assuming other 64 bit operations available,
like compare, subtract, shift.
*/
LOCALFUNC uint64_t Ui6Div(uint64_t num, uint64_t den)
{
uint64_t bit = 1;
uint64_t res = 0;
while ((den < num) && bit && ! (den & (LIT64(1) << 63))) {
den <<= 1;
bit <<= 1;
}
while (bit) {
if (num >= den) {
num -= den;
res |= bit;
}
bit >>= 1;
den >>= 1;
}
return res;
}
#endif
LOCALFUNC uint64_t estimateDiv128To64( uint64_t a0, uint64_t a1, uint64_t b )
{
uint64_t b0, b1;
uint64_t rem0, rem1, term0, term1;
uint64_t z;
if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF );
b0 = b>>32;
z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : Ui6Div(a0, b0) << 32;
mul64To128( b, z, &term0, &term1 );
sub128( a0, a1, term0, term1, &rem0, &rem1 );
while ( ( (int64_t) rem0 ) < 0 ) {
z -= LIT64( 0x100000000 );
b1 = b<<32;
add128( rem0, rem1, b0, b1, &rem0, &rem1 );
}
rem0 = ( rem0<<32 ) | ( rem1>>32 );
z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : Ui6Div(rem0, b0);
return z;
}
/*----------------------------------------------------------------------------
| Returns an approximation to the square root of the 32-bit significand given
| by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
| `aExp' (the least significant bit) is 1, the integer returned approximates
| 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
| case, the approximation returned lies strictly within +/-2 of the exact
| value.
*----------------------------------------------------------------------------*/
LOCALFUNC uint32_t estimateSqrt32( int16_t aExp, uint32_t a )
{
static const uint16_t sqrtOddAdjustments[] = {
0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
};
static const uint16_t sqrtEvenAdjustments[] = {
0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
};
int8_t index;
uint32_t z;
index = ( a>>27 ) & 15;
if ( aExp & 1 ) {
z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ index ];
z = ( ( a / z )<<14 ) + ( z<<15 );
a >>= 1;
}
else {
z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ index ];
z = a / z + z;
z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
}
return ( (uint32_t) Ui6Div( ( ( (uint64_t) a )<<31 ), z ) ) + ( z>>1 );
}
/*----------------------------------------------------------------------------
| Returns the number of leading 0 bits before the most-significant 1 bit of
| `a'. If `a' is zero, 32 is returned.
*----------------------------------------------------------------------------*/
LOCALFUNC int8_t countLeadingZeros32( uint32_t a )
{
static const int8_t countLeadingZerosHigh[] = {
8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
int8_t shiftCount;
shiftCount = 0;
if ( a < 0x10000 ) {
shiftCount += 16;
a <<= 16;
}
if ( a < 0x1000000 ) {
shiftCount += 8;
a <<= 8;
}
shiftCount += countLeadingZerosHigh[ a>>24 ];
return shiftCount;
}
/*----------------------------------------------------------------------------
| Returns the number of leading 0 bits before the most-significant 1 bit of
| `a'. If `a' is zero, 64 is returned.
*----------------------------------------------------------------------------*/
LOCALFUNC int8_t countLeadingZeros64( uint64_t a )
{
int8_t shiftCount;
shiftCount = 0;
if ( a < ( (uint64_t) 1 )<<32 ) {
shiftCount += 32;
}
else {
a >>= 32;
}
shiftCount += countLeadingZeros32( a );
return shiftCount;
}
/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
| Otherwise, returns 0.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC flag eq128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
{
return ( a0 == b0 ) && ( a1 == b1 );
}
/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
| Otherwise, returns 0.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC flag le128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
{
return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) );
}
/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
| than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
| returns 0.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC flag lt128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
{
return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) );
}
/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
| not equal to the 128-bit value formed by concatenating `b0' and `b1'.
| Otherwise, returns 0.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALINLINEFUNC flag ne128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
{
return ( a0 != b0 ) || ( a1 != b1 );
}
#endif
/* ----- end from original file "softfloat-macros" ----- */
/*----------------------------------------------------------------------------
| Functions and definitions to determine: (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output. These details are target-
| specific.
*----------------------------------------------------------------------------*/
/* ----- from original file "softfloat-specialize" ----- */
/*============================================================================
This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
Arithmetic Package, Release 2b.
["original SoftFloat Copyright notice" went here, included near top of this file.]
=============================================================================*/
/*----------------------------------------------------------------------------
| Software IEC/IEEE floating-point underflow tininess-detection mode.
*----------------------------------------------------------------------------*/
enum {
float_tininess_after_rounding = 0,
float_tininess_before_rounding = 1
};
/*----------------------------------------------------------------------------
| Underflow tininess-detection mode, statically initialized to default value.
| (The declaration in `softfloat.h' must match the `int8_t' type here.)
*----------------------------------------------------------------------------*/
LOCALVAR int8_t float_detect_tininess = float_tininess_after_rounding;
/*----------------------------------------------------------------------------
| Routine to raise any or all of the software IEC/IEEE floating-point
| exception flags.
*----------------------------------------------------------------------------*/
/*----------------------------------------------------------------------------
| Raises the exceptions specified by `flags'. Floating-point traps can be
| defined here if desired. It is currently not possible for such a trap
| to substitute a result value. If traps are not implemented, this routine
| should be simply `float_exception_flags |= flags;'.
*----------------------------------------------------------------------------*/
LOCALFUNC void float_raise( int8_t flags )
{
float_exception_flags |= flags;
}
/*----------------------------------------------------------------------------
| Internal canonical NaN format.
*----------------------------------------------------------------------------*/
typedef struct {
flag sign;
uint64_t high, low;
} commonNaNT;
/*----------------------------------------------------------------------------
| The pattern for a default generated extended double-precision NaN. The
| `high' and `low' values hold the most- and least-significant bits,
| respectively.
*----------------------------------------------------------------------------*/
#define floatx80_default_nan_high 0xFFFF
#define floatx80_default_nan_low LIT64( 0xC000000000000000 )
/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is a
| NaN; otherwise returns 0.
*----------------------------------------------------------------------------*/
LOCALFUNC flag floatx80_is_nan( floatx80 a )
{
return ( ( a.high & 0x7FFF ) == 0x7FFF ) && (uint64_t) ( a.low<<1 );
}
/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is a
| signaling NaN; otherwise returns 0.
*----------------------------------------------------------------------------*/
LOCALFUNC flag floatx80_is_signaling_nan( floatx80 a )
{
uint64_t aLow;
aLow = a.low & ~ LIT64( 0x4000000000000000 );
return
( ( a.high & 0x7FFF ) == 0x7FFF )
&& (uint64_t) ( aLow<<1 )
&& ( a.low == aLow );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
| invalid exception is raised.
*----------------------------------------------------------------------------*/
LOCALFUNC commonNaNT floatx80ToCommonNaN( floatx80 a )
{
commonNaNT z;
if ( floatx80_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
z.sign = a.high>>15;
z.low = 0;
z.high = a.low<<1;
return z;
}
/*----------------------------------------------------------------------------
| Returns the result of converting the canonical NaN `a' to the extended
| double-precision floating-point format.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 commonNaNToFloatx80( commonNaNT a )
{
floatx80 z;
z.low = LIT64( 0xC000000000000000 ) | ( a.high>>1 );
z.high = ( ( (uint16_t) a.sign )<<15 ) | 0x7FFF;
return z;
}
/*----------------------------------------------------------------------------
| Takes two extended double-precision floating-point values `a' and `b', one
| of which is a NaN, and returns the appropriate NaN result. If either `a' or
| `b' is a signaling NaN, the invalid exception is raised.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 propagateFloatx80NaN( floatx80 a, floatx80 b )
{
flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
aIsNaN = floatx80_is_nan( a );
aIsSignalingNaN = floatx80_is_signaling_nan( a );
bIsNaN = floatx80_is_nan( b );
bIsSignalingNaN = floatx80_is_signaling_nan( b );
a.low |= LIT64( 0xC000000000000000 );
b.low |= LIT64( 0xC000000000000000 );
if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
if ( aIsSignalingNaN ) {
if ( bIsSignalingNaN ) goto returnLargerSignificand;
return bIsNaN ? b : a;
}
else if ( aIsNaN ) {
if ( bIsSignalingNaN | ! bIsNaN ) return a;
returnLargerSignificand:
if ( a.low < b.low ) return b;
if ( b.low < a.low ) return a;
return ( a.high < b.high ) ? a : b;
}
else {
return b;
}
}
#ifdef FLOAT128
/*----------------------------------------------------------------------------
| The pattern for a default generated quadruple-precision NaN. The `high' and
| `low' values hold the most- and least-significant bits, respectively.
*----------------------------------------------------------------------------*/
#define float128_default_nan_high LIT64( 0xFFFF800000000000 )
#define float128_default_nan_low LIT64( 0x0000000000000000 )
/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
| otherwise returns 0.
*----------------------------------------------------------------------------*/
LOCALFUNC flag float128_is_nan( float128 a )
{
return
( LIT64( 0xFFFE000000000000 ) <= (uint64_t) ( a.high<<1 ) )
&& ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) );
}
/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is a
| signaling NaN; otherwise returns 0.
*----------------------------------------------------------------------------*/
LOCALFUNC flag float128_is_signaling_nan( float128 a )
{
return
( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE )
&& ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point NaN
| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
| exception is raised.
*----------------------------------------------------------------------------*/
LOCALFUNC commonNaNT float128ToCommonNaN( float128 a )
{
commonNaNT z;
if ( float128_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
z.sign = a.high>>63;
shortShift128Left( a.high, a.low, 16, &z.high, &z.low );
return z;
}
/*----------------------------------------------------------------------------
| Returns the result of converting the canonical NaN `a' to the quadruple-
| precision floating-point format.
*----------------------------------------------------------------------------*/
LOCALFUNC float128 commonNaNToFloat128( commonNaNT a )
{
float128 z;
shift128Right( a.high, a.low, 16, &z.high, &z.low );
z.high |= ( ( (uint64_t) a.sign )<<63 ) | LIT64( 0x7FFF800000000000 );
return z;
}
/*----------------------------------------------------------------------------
| Takes two quadruple-precision floating-point values `a' and `b', one of
| which is a NaN, and returns the appropriate NaN result. If either `a' or
| `b' is a signaling NaN, the invalid exception is raised.
*----------------------------------------------------------------------------*/
LOCALFUNC float128 propagateFloat128NaN( float128 a, float128 b )
{
flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
aIsNaN = float128_is_nan( a );
aIsSignalingNaN = float128_is_signaling_nan( a );
bIsNaN = float128_is_nan( b );
bIsSignalingNaN = float128_is_signaling_nan( b );
a.high |= LIT64( 0x0000800000000000 );
b.high |= LIT64( 0x0000800000000000 );
if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
if ( aIsSignalingNaN ) {
if ( bIsSignalingNaN ) goto returnLargerSignificand;
return bIsNaN ? b : a;
}
else if ( aIsNaN ) {
if ( bIsSignalingNaN | ! bIsNaN ) return a;
returnLargerSignificand:
if ( lt128( a.high<<1, a.low, b.high<<1, b.low ) ) return b;
if ( lt128( b.high<<1, b.low, a.high<<1, a.low ) ) return a;
return ( a.high < b.high ) ? a : b;
}
else {
return b;
}
}
#endif
/* ----- end from original file "softfloat-specialize" ----- */
/* ----- from original file "softfloat.c" ----- */
/*============================================================================
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
Package, Release 2b.
["original SoftFloat Copyright notice" went here, included near top of this file.]
=============================================================================*/
/*----------------------------------------------------------------------------
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
| and 7, and returns the properly rounded 32-bit integer corresponding to the
| input. If `zSign' is 1, the input is negated before being converted to an
| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer. However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/
LOCALFUNC int32_t roundAndPackInt32( flag zSign, uint64_t absZ )
{
int8_t roundingMode;
flag roundNearestEven;
int8_t roundIncrement, roundBits;
int32_t z;
roundingMode = float_rounding_mode;
roundNearestEven = ( roundingMode == float_round_nearest_even );
roundIncrement = 0x40;
if ( ! roundNearestEven ) {
if ( roundingMode == float_round_to_zero ) {
roundIncrement = 0;
}
else {
roundIncrement = 0x7F;
if ( zSign ) {
if ( roundingMode == float_round_up ) roundIncrement = 0;
}
else {
if ( roundingMode == float_round_down ) roundIncrement = 0;
}
}
}
roundBits = absZ & 0x7F;
absZ = ( absZ + roundIncrement )>>7;
absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
z = absZ;
if ( zSign ) z = - z;
if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
float_raise( float_flag_invalid );
return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
}
if ( roundBits ) float_exception_flags |= float_flag_inexact;
return z;
}
/*----------------------------------------------------------------------------
| Returns the fraction bits of the extended double-precision floating-point
| value `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC uint64_t extractFloatx80Frac( floatx80 a )
{
return a.low;
}
/*----------------------------------------------------------------------------
| Returns the exponent bits of the extended double-precision floating-point
| value `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC int32_t extractFloatx80Exp( floatx80 a )
{
return a.high & 0x7FFF;
}
/*----------------------------------------------------------------------------
| Returns the sign bit of the extended double-precision floating-point value
| `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC flag extractFloatx80Sign( floatx80 a )
{
return a.high>>15;
}
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'. The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/
LOCALPROC
normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
{
int8_t shiftCount;
shiftCount = countLeadingZeros64( aSig );
*zSigPtr = aSig<<shiftCount;
*zExpPtr = 1 - shiftCount;
}
/*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
| extended double-precision floating-point value, returning the result.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
{
floatx80 z;
z.low = zSig;
z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
return z;
}
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input. Ordinarily, the abstract value is
| rounded and packed into the extended double-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
| If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively. Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
| The input significand must be normalized or smaller. If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding. The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80
roundAndPackFloatx80(
int8_t roundingPrecision, flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1
)
{
int8_t roundingMode;
flag roundNearestEven, increment, isTiny;
int64_t roundIncrement, roundMask, roundBits;
roundingMode = float_rounding_mode;
roundNearestEven = ( roundingMode == float_round_nearest_even );
if ( roundingPrecision == 80 ) goto precision80;
if ( roundingPrecision == 64 ) {
roundIncrement = LIT64( 0x0000000000000400 );
roundMask = LIT64( 0x00000000000007FF );
}
else if ( roundingPrecision == 32 ) {
roundIncrement = LIT64( 0x0000008000000000 );
roundMask = LIT64( 0x000000FFFFFFFFFF );
}
else {
goto precision80;
}
zSig0 |= ( zSig1 != 0 );
if ( ! roundNearestEven ) {
if ( roundingMode == float_round_to_zero ) {
roundIncrement = 0;
}
else {
roundIncrement = roundMask;
if ( zSign ) {
if ( roundingMode == float_round_up ) roundIncrement = 0;
}
else {
if ( roundingMode == float_round_down ) roundIncrement = 0;
}
}
}
roundBits = zSig0 & roundMask;
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
if ( ( 0x7FFE < zExp )
|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
) {
goto overflow;
}
if ( zExp <= 0 ) {
isTiny =
( float_detect_tininess == float_tininess_before_rounding )
|| ( zExp < 0 )
|| ( zSig0 <= zSig0 + roundIncrement );
shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
zExp = 0;
roundBits = zSig0 & roundMask;
if ( isTiny && roundBits ) float_raise( float_flag_underflow );
if ( roundBits ) float_exception_flags |= float_flag_inexact;
zSig0 += roundIncrement;
if ( (int64_t) zSig0 < 0 ) zExp = 1;
roundIncrement = roundMask + 1;
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
roundMask |= roundIncrement;
}
zSig0 &= ~ roundMask;
return packFloatx80( zSign, zExp, zSig0 );
}
}
if ( roundBits ) float_exception_flags |= float_flag_inexact;
zSig0 += roundIncrement;
if ( zSig0 < roundIncrement ) {
++zExp;
zSig0 = LIT64( 0x8000000000000000 );
}
roundIncrement = roundMask + 1;
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
roundMask |= roundIncrement;
}
zSig0 &= ~ roundMask;
if ( zSig0 == 0 ) zExp = 0;
return packFloatx80( zSign, zExp, zSig0 );
precision80:
increment = ( (int64_t) zSig1 < 0 );
if ( ! roundNearestEven ) {
if ( roundingMode == float_round_to_zero ) {
increment = 0;
}
else {
if ( zSign ) {
increment = ( roundingMode == float_round_down ) && zSig1;
}
else {
increment = ( roundingMode == float_round_up ) && zSig1;
}
}
}
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
if ( ( 0x7FFE < zExp )
|| ( ( zExp == 0x7FFE )
&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
&& increment
)
) {
roundMask = 0;
overflow:
float_raise( float_flag_overflow | float_flag_inexact );
if ( ( roundingMode == float_round_to_zero )
|| ( zSign && ( roundingMode == float_round_up ) )
|| ( ! zSign && ( roundingMode == float_round_down ) )
) {
return packFloatx80( zSign, 0x7FFE, ~ roundMask );
}
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
}
if ( zExp <= 0 ) {
isTiny =
( float_detect_tininess == float_tininess_before_rounding )
|| ( zExp < 0 )
|| ! increment
|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
zExp = 0;
if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
if ( zSig1 ) float_exception_flags |= float_flag_inexact;
if ( roundNearestEven ) {
increment = ( (int64_t) zSig1 < 0 );
}
else {
if ( zSign ) {
increment = ( roundingMode == float_round_down ) && zSig1;
}
else {
increment = ( roundingMode == float_round_up ) && zSig1;
}
}
if ( increment ) {
++zSig0;
zSig0 &=
~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
if ( (int64_t) zSig0 < 0 ) zExp = 1;
}
return packFloatx80( zSign, zExp, zSig0 );
}
}
if ( zSig1 ) float_exception_flags |= float_flag_inexact;
if ( increment ) {
++zSig0;
if ( zSig0 == 0 ) {
++zExp;
zSig0 = LIT64( 0x8000000000000000 );
}
else {
zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
}
}
else {
if ( zSig0 == 0 ) zExp = 0;
}
return packFloatx80( zSign, zExp, zSig0 );
}
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input. This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80
normalizeRoundAndPackFloatx80(
int8_t roundingPrecision, flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1
)
{
int8_t shiftCount;
if ( zSig0 == 0 ) {
zSig0 = zSig1;
zSig1 = 0;
zExp -= 64;
}
shiftCount = countLeadingZeros64( zSig0 );
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
zExp -= shiftCount;
return
roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
}
#ifdef FLOAT128
/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC uint64_t extractFloat128Frac1( float128 a )
{
return a.low;
}
/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC uint64_t extractFloat128Frac0( float128 a )
{
return a.high & LIT64( 0x0000FFFFFFFFFFFF );
}
/*----------------------------------------------------------------------------
| Returns the exponent bits of the quadruple-precision floating-point value
| `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC int32_t extractFloat128Exp( float128 a )
{
return ( a.high>>48 ) & 0x7FFF;
}
/*----------------------------------------------------------------------------
| Returns the sign bit of the quadruple-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC flag extractFloat128Sign( float128 a )
{
return a.high>>63;
}
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value
| represented by the denormalized significand formed by the concatenation of
| `aSig0' and `aSig1'. The normalized exponent is stored at the location
| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
| significand are stored at the location pointed to by `zSig0Ptr', and the
| least significant 64 bits of the normalized significand are stored at the
| location pointed to by `zSig1Ptr'.
*----------------------------------------------------------------------------*/
LOCALPROC
normalizeFloat128Subnormal(
uint64_t aSig0,
uint64_t aSig1,
int32_t *zExpPtr,
uint64_t *zSig0Ptr,
uint64_t *zSig1Ptr
)
{
int8_t shiftCount;
if ( aSig0 == 0 ) {
shiftCount = countLeadingZeros64( aSig1 ) - 15;
if ( shiftCount < 0 ) {
*zSig0Ptr = aSig1>>( - shiftCount );
*zSig1Ptr = aSig1<<( shiftCount & 63 );
}
else {
*zSig0Ptr = aSig1<<shiftCount;
*zSig1Ptr = 0;
}
*zExpPtr = - shiftCount - 63;
}
else {
shiftCount = countLeadingZeros64( aSig0 ) - 15;
shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
*zExpPtr = 1 - shiftCount;
}
}
/*----------------------------------------------------------------------------
| Packs the sign `zSign', the exponent `zExp', and the significand formed
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
| floating-point value, returning the result. After being shifted into the
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 32 bits of the result. This
| means that any integer portion of `zSig0' will be added into the exponent.
| Since a properly normalized significand will have an integer portion equal
| to 1, the `zExp' input should be 1 less than the desired result exponent
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
| significand.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC float128
packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
{
float128 z;
z.low = zSig1;
z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
return z;
}
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0', `zSig1',
| and `zSig2', and returns the proper quadruple-precision floating-point value
| corresponding to the abstract input. Ordinarily, the abstract value is
| simply rounded and packed into the quadruple-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal quadruple-
| precision floating-point number.
| The input significand must be normalized or smaller. If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding. In the
| usual case that the input significand is normalized, `zExp' must be 1 less
| than the ``true'' floating-point exponent. The handling of underflow and
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float128
roundAndPackFloat128(
flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 )
{
int8_t roundingMode;
flag roundNearestEven, increment, isTiny;
roundingMode = float_rounding_mode;
roundNearestEven = ( roundingMode == float_round_nearest_even );
increment = ( (int64_t) zSig2 < 0 );
if ( ! roundNearestEven ) {
if ( roundingMode == float_round_to_zero ) {
increment = 0;
}
else {
if ( zSign ) {
increment = ( roundingMode == float_round_down ) && zSig2;
}
else {
increment = ( roundingMode == float_round_up ) && zSig2;
}
}
}
if ( 0x7FFD <= (uint32_t) zExp ) {
if ( ( 0x7FFD < zExp )
|| ( ( zExp == 0x7FFD )
&& eq128(
LIT64( 0x0001FFFFFFFFFFFF ),
LIT64( 0xFFFFFFFFFFFFFFFF ),
zSig0,
zSig1
)
&& increment
)
) {
float_raise( float_flag_overflow | float_flag_inexact );
if ( ( roundingMode == float_round_to_zero )
|| ( zSign && ( roundingMode == float_round_up ) )
|| ( ! zSign && ( roundingMode == float_round_down ) )
) {
return
packFloat128(
zSign,
0x7FFE,
LIT64( 0x0000FFFFFFFFFFFF ),
LIT64( 0xFFFFFFFFFFFFFFFF )
);
}
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( zExp < 0 ) {
isTiny =
( float_detect_tininess == float_tininess_before_rounding )
|| ( zExp < -1 )
|| ! increment
|| lt128(
zSig0,
zSig1,
LIT64( 0x0001FFFFFFFFFFFF ),
LIT64( 0xFFFFFFFFFFFFFFFF )
);
shift128ExtraRightJamming(
zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
zExp = 0;
if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
if ( roundNearestEven ) {
increment = ( (int64_t) zSig2 < 0 );
}
else {
if ( zSign ) {
increment = ( roundingMode == float_round_down ) && zSig2;
}
else {
increment = ( roundingMode == float_round_up ) && zSig2;
}
}
}
}
if ( zSig2 ) float_exception_flags |= float_flag_inexact;
if ( increment ) {
add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
}
else {
if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
}
return packFloat128( zSign, zExp, zSig0, zSig1 );
}
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input. This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
*----------------------------------------------------------------------------*/
LOCALFUNC float128
normalizeRoundAndPackFloat128(
flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
{
int8_t shiftCount;
uint64_t zSig2;
if ( zSig0 == 0 ) {
zSig0 = zSig1;
zSig1 = 0;
zExp -= 64;
}
shiftCount = countLeadingZeros64( zSig0 ) - 15;
if ( 0 <= shiftCount ) {
zSig2 = 0;
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
}
else {
shift128ExtraRightJamming(
zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
}
zExp -= shiftCount;
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
}
#endif
/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a'
| to the extended double-precision floating-point format. The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 int32_to_floatx80( int32_t a )
{
flag zSign;
uint32_t absA;
int8_t shiftCount;
uint64_t zSig;
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
zSign = ( a < 0 );
absA = zSign ? - a : a;
shiftCount = countLeadingZeros32( absA ) + 32;
zSig = absA;
return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode. If `a' is a NaN, the
| largest positive integer is returned. Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/
LOCALFUNC int32_t floatx80_to_int32( floatx80 a )
{
flag aSign;
int32_t aExp, shiftCount;
uint64_t aSig;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
aSign = extractFloatx80Sign( a );
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
shiftCount = 0x4037 - aExp;
if ( shiftCount <= 0 ) shiftCount = 1;
shift64RightJamming( aSig, shiftCount, &aSig );
return roundAndPackInt32( aSign, aSig );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero. If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC int32_t floatx80_to_int32_round_to_zero( floatx80 a )
{
flag aSign;
int32_t aExp, shiftCount;
uint64_t aSig, savedASig;
int32_t z;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
aSign = extractFloatx80Sign( a );
if ( 0x401E < aExp ) {
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
goto invalid;
}
else if ( aExp < 0x3FFF ) {
if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
return 0;
}
shiftCount = 0x403E - aExp;
savedASig = aSig;
aSig >>= shiftCount;
z = aSig;
if ( aSign ) z = - z;
if ( ( z < 0 ) ^ aSign ) {
invalid:
float_raise( float_flag_invalid );
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
}
if ( ( aSig<<shiftCount ) != savedASig ) {
float_exception_flags |= float_flag_inexact;
}
return z;
}
#endif
#ifdef FLOAT128
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float128 floatx80_to_float128( floatx80 a )
{
flag aSign;
int16_t aExp;
uint64_t aSig, zSig0, zSig1;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
aSign = extractFloatx80Sign( a );
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
}
shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
return packFloat128( aSign, aExp, zSig0, zSig1 );
}
#endif
/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended quadruple-precision floating-point
| value. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_round_to_int( floatx80 a )
{
flag aSign;
int32_t aExp;
uint64_t lastBitMask, roundBitsMask;
int8_t roundingMode;
floatx80 z;
aExp = extractFloatx80Exp( a );
if ( 0x403E <= aExp ) {
if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
return propagateFloatx80NaN( a, a );
}
return a;
}
if ( aExp < 0x3FFF ) {
if ( ( aExp == 0 )
&& ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
return a;
}
float_exception_flags |= float_flag_inexact;
aSign = extractFloatx80Sign( a );
switch ( float_rounding_mode ) {
case float_round_nearest_even:
if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
) {
return
packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
}
break;
case float_round_down:
return
aSign ?
packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
: packFloatx80( 0, 0, 0 );
case float_round_up:
return
aSign ? packFloatx80( 1, 0, 0 )
: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
}
return packFloatx80( aSign, 0, 0 );
}
lastBitMask = 1;
lastBitMask <<= 0x403E - aExp;
roundBitsMask = lastBitMask - 1;
z = a;
roundingMode = float_rounding_mode;
if ( roundingMode == float_round_nearest_even ) {
z.low += lastBitMask>>1;
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
}
else if ( roundingMode != float_round_to_zero ) {
if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
z.low += roundBitsMask;
}
}
z.low &= ~ roundBitsMask;
if ( z.low == 0 ) {
++z.high;
z.low = LIT64( 0x8000000000000000 );
}
if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
return z;
}
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
| negated before being returned. `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
{
int32_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig0, zSig1;
int32_t expDiff;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
bSig = extractFloatx80Frac( b );
bExp = extractFloatx80Exp( b );
expDiff = aExp - bExp;
if ( 0 < expDiff ) {
if ( aExp == 0x7FFF ) {
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
return a;
}
if ( bExp == 0 ) --expDiff;
shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
zExp = aExp;
}
else if ( expDiff < 0 ) {
if ( bExp == 0x7FFF ) {
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
}
if ( aExp == 0 ) ++expDiff;
shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
zExp = bExp;
}
else {
if ( aExp == 0x7FFF ) {
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
return propagateFloatx80NaN( a, b );
}
return a;
}
zSig1 = 0;
zSig0 = aSig + bSig;
if ( aExp == 0 ) {
normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
goto roundAndPack;
}
zExp = aExp;
goto shiftRight1;
}
zSig0 = aSig + bSig;
if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
shiftRight1:
shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
zSig0 |= LIT64( 0x8000000000000000 );
++zExp;
roundAndPack:
return
roundAndPackFloatx80(
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
}
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
| difference is negated before being returned. `zSign' is ignored if the
| result is a NaN. The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
{
int32_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig0, zSig1;
int32_t expDiff;
floatx80 z;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
bSig = extractFloatx80Frac( b );
bExp = extractFloatx80Exp( b );
expDiff = aExp - bExp;
if ( 0 < expDiff ) goto aExpBigger;
if ( expDiff < 0 ) goto bExpBigger;
if ( aExp == 0x7FFF ) {
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
return propagateFloatx80NaN( a, b );
}
float_raise( float_flag_invalid );
z.low = floatx80_default_nan_low;
z.high = floatx80_default_nan_high;
return z;
}
if ( aExp == 0 ) {
aExp = 1;
bExp = 1;
}
zSig1 = 0;
if ( bSig < aSig ) goto aBigger;
if ( aSig < bSig ) goto bBigger;
return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
bExpBigger:
if ( bExp == 0x7FFF ) {
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
}
if ( aExp == 0 ) ++expDiff;
shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
bBigger:
sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
zExp = bExp;
zSign ^= 1;
goto normalizeRoundAndPack;
aExpBigger:
if ( aExp == 0x7FFF ) {
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
return a;
}
if ( bExp == 0 ) --expDiff;
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
aBigger:
sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
zExp = aExp;
normalizeRoundAndPack:
return
normalizeRoundAndPackFloatx80(
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
}
/*----------------------------------------------------------------------------
| Returns the result of adding the extended double-precision floating-point
| values `a' and `b'. The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_add( floatx80 a, floatx80 b )
{
flag aSign, bSign;
aSign = extractFloatx80Sign( a );
bSign = extractFloatx80Sign( b );
if ( aSign == bSign ) {
return addFloatx80Sigs( a, b, aSign );
}
else {
return subFloatx80Sigs( a, b, aSign );
}
}
/*----------------------------------------------------------------------------
| Returns the result of subtracting the extended double-precision floating-
| point values `a' and `b'. The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_sub( floatx80 a, floatx80 b )
{
flag aSign, bSign;
aSign = extractFloatx80Sign( a );
bSign = extractFloatx80Sign( b );
if ( aSign == bSign ) {
return subFloatx80Sigs( a, b, aSign );
}
else {
return addFloatx80Sigs( a, b, aSign );
}
}
/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'. The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_mul( floatx80 a, floatx80 b )
{
flag aSign, bSign, zSign;
int32_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig0, zSig1;
floatx80 z;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
aSign = extractFloatx80Sign( a );
bSig = extractFloatx80Frac( b );
bExp = extractFloatx80Exp( b );
bSign = extractFloatx80Sign( b );
zSign = aSign ^ bSign;
if ( aExp == 0x7FFF ) {
if ( (uint64_t) ( aSig<<1 )
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
return propagateFloatx80NaN( a, b );
}
if ( ( bExp | bSig ) == 0 ) goto invalid;
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
}
if ( bExp == 0x7FFF ) {
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
if ( ( aExp | aSig ) == 0 ) {
invalid:
float_raise( float_flag_invalid );
z.low = floatx80_default_nan_low;
z.high = floatx80_default_nan_high;
return z;
}
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
}
if ( aExp == 0 ) {
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
}
if ( bExp == 0 ) {
if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
}
zExp = aExp + bExp - 0x3FFE;
mul64To128( aSig, bSig, &zSig0, &zSig1 );
if ( 0 < (int64_t) zSig0 ) {
shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
--zExp;
}
return
roundAndPackFloatx80(
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
}
/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_div( floatx80 a, floatx80 b )
{
flag aSign, bSign, zSign;
int32_t aExp, bExp, zExp;
uint64_t aSig, bSig, zSig0, zSig1;
uint64_t rem0, rem1, rem2, term0, term1, term2;
floatx80 z;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
aSign = extractFloatx80Sign( a );
bSig = extractFloatx80Frac( b );
bExp = extractFloatx80Exp( b );
bSign = extractFloatx80Sign( b );
zSign = aSign ^ bSign;
if ( aExp == 0x7FFF ) {
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
if ( bExp == 0x7FFF ) {
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
goto invalid;
}
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
}
if ( bExp == 0x7FFF ) {
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
return packFloatx80( zSign, 0, 0 );
}
if ( bExp == 0 ) {
if ( bSig == 0 ) {
if ( ( aExp | aSig ) == 0 ) {
invalid:
float_raise( float_flag_invalid );
z.low = floatx80_default_nan_low;
z.high = floatx80_default_nan_high;
return z;
}
float_raise( float_flag_divbyzero );
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
}
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
}
if ( aExp == 0 ) {
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
}
if ( aSig == 0 ) {
/*
added by pcp. this invalid input seems to
cause hang in estimateDiv128To64. should
validate inputs generally.
*/
return packFloatx80( zSign, 0, 0 );
}
if ( (int64_t) bSig >= 0 ) {
/*
added by pcp. another check.
invalid input, most significant bit should be set?
*/
return packFloatx80( zSign, 0, 0 );
}
zExp = aExp - bExp + 0x3FFE;
rem1 = 0;
if ( bSig <= aSig ) {
shift128Right( aSig, 0, 1, &aSig, &rem1 );
++zExp;
}
zSig0 = estimateDiv128To64( aSig, rem1, bSig );
mul64To128( bSig, zSig0, &term0, &term1 );
sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
while ( (int64_t) rem0 < 0 ) {
--zSig0;
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
}
zSig1 = estimateDiv128To64( rem1, 0, bSig );
if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
mul64To128( bSig, zSig1, &term1, &term2 );
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
while ( (int64_t) rem1 < 0 ) {
--zSig1;
add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
}
zSig1 |= ( ( rem1 | rem2 ) != 0 );
}
return
roundAndPackFloatx80(
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
}
/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_rem( floatx80 a, floatx80 b )
{
flag aSign, /* bSign, */ zSign;
int32_t aExp, bExp, expDiff;
uint64_t aSig0, aSig1, bSig;
uint64_t q, term0, term1, alternateASig0, alternateASig1;
floatx80 z;
aSig0 = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
aSign = extractFloatx80Sign( a );
bSig = extractFloatx80Frac( b );
bExp = extractFloatx80Exp( b );
/*
not used. should it be?
bSign = extractFloatx80Sign( b );
*/
if ( aExp == 0x7FFF ) {
if ( (uint64_t) ( aSig0<<1 )
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
return propagateFloatx80NaN( a, b );
}
goto invalid;
}
if ( bExp == 0x7FFF ) {
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
return a;
}
if ( bExp == 0 ) {
if ( bSig == 0 ) {
invalid:
float_raise( float_flag_invalid );
z.low = floatx80_default_nan_low;
z.high = floatx80_default_nan_high;
return z;
}
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
}
if ( aExp == 0 ) {
if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
}
bSig |= LIT64( 0x8000000000000000 );
zSign = aSign;
expDiff = aExp - bExp;
aSig1 = 0;
if ( expDiff < 0 ) {
if ( expDiff < -1 ) return a;
shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
expDiff = 0;
}
q = ( bSig <= aSig0 );
if ( q ) aSig0 -= bSig;
expDiff -= 64;
while ( 0 < expDiff ) {
q = estimateDiv128To64( aSig0, aSig1, bSig );
q = ( 2 < q ) ? q - 2 : 0;
mul64To128( bSig, q, &term0, &term1 );
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
expDiff -= 62;
}
expDiff += 64;
if ( 0 < expDiff ) {
q = estimateDiv128To64( aSig0, aSig1, bSig );
q = ( 2 < q ) ? q - 2 : 0;
q >>= 64 - expDiff;
mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
while ( le128( term0, term1, aSig0, aSig1 ) ) {
++q;
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
}
}
else {
term1 = 0;
term0 = bSig;
}
sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
&& ( q & 1 ) )
) {
aSig0 = alternateASig0;
aSig1 = alternateASig1;
zSign = ! zSign;
}
return
normalizeRoundAndPackFloatx80(
80, zSign, bExp + expDiff, aSig0, aSig1 );
}
/*----------------------------------------------------------------------------
| Returns the square root of the extended double-precision floating-point
| value `a'. The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_sqrt( floatx80 a )
{
flag aSign;
int32_t aExp, zExp;
uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
floatx80 z;
aSig0 = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
aSign = extractFloatx80Sign( a );
if ( aExp == 0x7FFF ) {
if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
if ( ! aSign ) return a;
goto invalid;
}
if ( aSign ) {
if ( ( aExp | aSig0 ) == 0 ) return a;
invalid:
float_raise( float_flag_invalid );
z.low = floatx80_default_nan_low;
z.high = floatx80_default_nan_high;
return z;
}
if ( aExp == 0 ) {
if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
}
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
zSig0 = estimateSqrt32( aExp, aSig0>>32 );
shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
doubleZSig0 = zSig0<<1;
mul64To128( zSig0, zSig0, &term0, &term1 );
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
while ( (int64_t) rem0 < 0 ) {
--zSig0;
doubleZSig0 -= 2;
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
}
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
if ( zSig1 == 0 ) zSig1 = 1;
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
mul64To128( zSig1, zSig1, &term2, &term3 );
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
while ( (int64_t) rem1 < 0 ) {
--zSig1;
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
term3 |= 1;
term2 |= doubleZSig0;
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
}
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
}
shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
zSig0 |= doubleZSig0;
return
roundAndPackFloatx80(
floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
}
/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| equal to the corresponding value `b', and 0 otherwise. The comparison is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC flag floatx80_eq( floatx80 a, floatx80 b )
{
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
) {
if ( floatx80_is_signaling_nan( a )
|| floatx80_is_signaling_nan( b ) ) {
float_raise( float_flag_invalid );
}
return 0;
}
return
( a.low == b.low )
&& ( ( a.high == b.high )
|| ( ( a.low == 0 )
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
);
}
#endif
/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise. The
| comparison is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC flag floatx80_le( floatx80 a, floatx80 b )
{
flag aSign, bSign;
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
) {
float_raise( float_flag_invalid );
return 0;
}
aSign = extractFloatx80Sign( a );
bSign = extractFloatx80Sign( b );
if ( aSign != bSign ) {
return
aSign
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
== 0 );
}
return
aSign ? le128( b.high, b.low, a.high, a.low )
: le128( a.high, a.low, b.high, b.low );
}
#endif
/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than the corresponding value `b', and 0 otherwise. The comparison
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC flag floatx80_lt( floatx80 a, floatx80 b )
{
flag aSign, bSign;
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
) {
float_raise( float_flag_invalid );
return 0;
}
aSign = extractFloatx80Sign( a );
bSign = extractFloatx80Sign( b );
if ( aSign != bSign ) {
return
aSign
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
!= 0 );
}
return
aSign ? lt128( b.high, b.low, a.high, a.low )
: lt128( a.high, a.low, b.high, b.low );
}
#endif
/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is equal
| to the corresponding value `b', and 0 otherwise. The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC flag floatx80_eq_signaling( floatx80 a, floatx80 b )
{
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
) {
float_raise( float_flag_invalid );
return 0;
}
return
( a.low == b.low )
&& ( ( a.high == b.high )
|| ( ( a.low == 0 )
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
);
}
#endif
/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC flag floatx80_le_quiet( floatx80 a, floatx80 b )
{
flag aSign, bSign;
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
) {
if ( floatx80_is_signaling_nan( a )
|| floatx80_is_signaling_nan( b ) ) {
float_raise( float_flag_invalid );
}
return 0;
}
aSign = extractFloatx80Sign( a );
bSign = extractFloatx80Sign( b );
if ( aSign != bSign ) {
return
aSign
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
== 0 );
}
return
aSign ? le128( b.high, b.low, a.high, a.low )
: le128( a.high, a.low, b.high, b.low );
}
#endif
/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC flag floatx80_lt_quiet( floatx80 a, floatx80 b )
{
flag aSign, bSign;
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
) {
if ( floatx80_is_signaling_nan( a )
|| floatx80_is_signaling_nan( b ) ) {
float_raise( float_flag_invalid );
}
return 0;
}
aSign = extractFloatx80Sign( a );
bSign = extractFloatx80Sign( b );
if ( aSign != bSign ) {
return
aSign
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
!= 0 );
}
return
aSign ? lt128( b.high, b.low, a.high, a.low )
: lt128( a.high, a.low, b.high, b.low );
}
#endif
#ifdef FLOAT128
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 float128_to_floatx80( float128 a )
{
flag aSign;
int32_t aExp;
uint64_t aSig0, aSig1;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 ) {
return commonNaNToFloatx80( float128ToCommonNaN( a ) );
}
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
}
if ( aExp == 0 ) {
if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
}
else {
aSig0 |= LIT64( 0x0001000000000000 );
}
shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
}
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
| before being returned. `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
{
int32_t aExp, bExp, zExp;
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
int32_t expDiff;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
bSig1 = extractFloat128Frac1( b );
bSig0 = extractFloat128Frac0( b );
bExp = extractFloat128Exp( b );
expDiff = aExp - bExp;
if ( 0 < expDiff ) {
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
return a;
}
if ( bExp == 0 ) {
--expDiff;
}
else {
bSig0 |= LIT64( 0x0001000000000000 );
}
shift128ExtraRightJamming(
bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
zExp = aExp;
}
else if ( expDiff < 0 ) {
if ( bExp == 0x7FFF ) {
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( aExp == 0 ) {
++expDiff;
}
else {
aSig0 |= LIT64( 0x0001000000000000 );
}
shift128ExtraRightJamming(
aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
zExp = bExp;
}
else {
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
return propagateFloat128NaN( a, b );
}
return a;
}
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
zSig2 = 0;
zSig0 |= LIT64( 0x0002000000000000 );
zExp = aExp;
goto shiftRight1;
}
aSig0 |= LIT64( 0x0001000000000000 );
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
--zExp;
if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
++zExp;
shiftRight1:
shift128ExtraRightJamming(
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
roundAndPack:
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
}
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'. If `zSign' is 1, the
| difference is negated before being returned. `zSign' is ignored if the
| result is a NaN. The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
{
int32_t aExp, bExp, zExp;
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
int32_t expDiff;
float128 z;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
bSig1 = extractFloat128Frac1( b );
bSig0 = extractFloat128Frac0( b );
bExp = extractFloat128Exp( b );
expDiff = aExp - bExp;
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
if ( 0 < expDiff ) goto aExpBigger;
if ( expDiff < 0 ) goto bExpBigger;
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
return propagateFloat128NaN( a, b );
}
float_raise( float_flag_invalid );
z.low = float128_default_nan_low;
z.high = float128_default_nan_high;
return z;
}
if ( aExp == 0 ) {
aExp = 1;
bExp = 1;
}
if ( bSig0 < aSig0 ) goto aBigger;
if ( aSig0 < bSig0 ) goto bBigger;
if ( bSig1 < aSig1 ) goto aBigger;
if ( aSig1 < bSig1 ) goto bBigger;
return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
bExpBigger:
if ( bExp == 0x7FFF ) {
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
}
if ( aExp == 0 ) {
++expDiff;
}
else {
aSig0 |= LIT64( 0x4000000000000000 );
}
shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
bSig0 |= LIT64( 0x4000000000000000 );
bBigger:
sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
zExp = bExp;
zSign ^= 1;
goto normalizeRoundAndPack;
aExpBigger:
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
return a;
}
if ( bExp == 0 ) {
--expDiff;
}
else {
bSig0 |= LIT64( 0x4000000000000000 );
}
shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
aSig0 |= LIT64( 0x4000000000000000 );
aBigger:
sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
zExp = aExp;
normalizeRoundAndPack:
--zExp;
return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
}
/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float128 float128_add( float128 a, float128 b )
{
flag aSign, bSign;
aSign = extractFloat128Sign( a );
bSign = extractFloat128Sign( b );
if ( aSign == bSign ) {
return addFloat128Sigs( a, b, aSign );
}
else {
return subFloat128Sigs( a, b, aSign );
}
}
/*----------------------------------------------------------------------------
| Returns the result of subtracting the quadruple-precision floating-point
| values `a' and `b'. The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float128 float128_sub( float128 a, float128 b )
{
flag aSign, bSign;
aSign = extractFloat128Sign( a );
bSign = extractFloat128Sign( b );
if ( aSign == bSign ) {
return subFloat128Sigs( a, b, aSign );
}
else {
return addFloat128Sigs( a, b, aSign );
}
}
/*----------------------------------------------------------------------------
| Returns the result of multiplying the quadruple-precision floating-point
| values `a' and `b'. The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float128 float128_mul( float128 a, float128 b )
{
flag aSign, bSign, zSign;
int32_t aExp, bExp, zExp;
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
float128 z;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
bSig1 = extractFloat128Frac1( b );
bSig0 = extractFloat128Frac0( b );
bExp = extractFloat128Exp( b );
bSign = extractFloat128Sign( b );
zSign = aSign ^ bSign;
if ( aExp == 0x7FFF ) {
if ( ( aSig0 | aSig1 )
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
return propagateFloat128NaN( a, b );
}
if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( bExp == 0x7FFF ) {
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
invalid:
float_raise( float_flag_invalid );
z.low = float128_default_nan_low;
z.high = float128_default_nan_high;
return z;
}
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( aExp == 0 ) {
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
}
if ( bExp == 0 ) {
if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
}
zExp = aExp + bExp - 0x4000;
aSig0 |= LIT64( 0x0001000000000000 );
shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
zSig2 |= ( zSig3 != 0 );
if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
shift128ExtraRightJamming(
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
++zExp;
}
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
}
/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'. The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float128 float128_div( float128 a, float128 b )
{
flag aSign, bSign, zSign;
int32_t aExp, bExp, zExp;
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
float128 z;
aSig1 = extractFloat128Frac1( a );
aSig0 = extractFloat128Frac0( a );
aExp = extractFloat128Exp( a );
aSign = extractFloat128Sign( a );
bSig1 = extractFloat128Frac1( b );
bSig0 = extractFloat128Frac0( b );
bExp = extractFloat128Exp( b );
bSign = extractFloat128Sign( b );
zSign = aSign ^ bSign;
if ( aExp == 0x7FFF ) {
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
if ( bExp == 0x7FFF ) {
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
goto invalid;
}
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
if ( bExp == 0x7FFF ) {
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
return packFloat128( zSign, 0, 0, 0 );
}
if ( bExp == 0 ) {
if ( ( bSig0 | bSig1 ) == 0 ) {
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
invalid:
float_raise( float_flag_invalid );
z.low = float128_default_nan_low;
z.high = float128_default_nan_high;
return z;
}
float_raise( float_flag_divbyzero );
return packFloat128( zSign, 0x7FFF, 0, 0 );
}
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
}
if ( aExp == 0 ) {
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
}
zExp = aExp - bExp + 0x3FFD;
shortShift128Left(
aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
shortShift128Left(
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
++zExp;
}
zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
while ( (int64_t) rem0 < 0 ) {
--zSig0;
add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
}
zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
if ( ( zSig1 & 0x3FFF ) <= 4 ) {
mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
while ( (int64_t) rem1 < 0 ) {
--zSig1;
add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
}
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
}
shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
}
#endif
typedef unsigned int float32;
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/
LOCALPROC
normalizeFloat32Subnormal( uint32_t aSig, int16_t *zExpPtr, uint32_t *zSigPtr )
{
int8_t shiftCount;
shiftCount = countLeadingZeros32( aSig ) - 8;
*zSigPtr = aSig<<shiftCount;
*zExpPtr = 1 - shiftCount;
}
/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC uint32_t extractFloat32Frac( float32 a )
{
return a & 0x007FFFFF;
}
/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC int16_t extractFloat32Exp( float32 a )
{
return ( a>>23 ) & 0xFF;
}
/*----------------------------------------------------------------------------
| Returns the sign bit of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC flag extractFloat32Sign( float32 a )
{
return a>>31;
}
/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is a signaling
| NaN; otherwise returns 0.
*----------------------------------------------------------------------------*/
LOCALFUNC flag float32_is_signaling_nan( float32 a )
{
return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point NaN
| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
| exception is raised.
*----------------------------------------------------------------------------*/
LOCALFUNC commonNaNT float32ToCommonNaN( float32 a )
{
commonNaNT z;
if ( float32_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
z.sign = a>>31;
z.low = 0;
z.high = ( (uint64_t) a )<<41;
return z;
}
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 float32_to_floatx80( float32 a )
{
flag aSign;
int16_t aExp;
uint32_t aSig;
aSig = extractFloat32Frac( a );
aExp = extractFloat32Exp( a );
aSign = extractFloat32Sign( a );
if ( aExp == 0xFF ) {
if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
}
if ( aExp == 0 ) {
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
}
aSig |= 0x00800000;
return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
}
/*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
| single-precision floating-point value, returning the result. After being
| shifted into the proper positions, the three fields are simply added
| together to form the result. This means that any integer portion of `zSig'
| will be added into the exponent. Since a properly normalized significand
| will have an integer portion equal to 1, the `zExp' input should be 1 less
| than the desired result exponent whenever `zSig' is a complete, normalized
| significand.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC float32 packFloat32( flag zSign, int16_t zExp, uint32_t zSig )
{
return ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig;
}
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the single-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
| The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location. This shifted
| significand must be normalized or smaller. If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float32 roundAndPackFloat32( flag zSign, int16_t zExp, uint32_t zSig )
{
int8_t roundingMode;
flag roundNearestEven;
int8_t roundIncrement, roundBits;
flag isTiny;
roundingMode = float_rounding_mode;
roundNearestEven = ( roundingMode == float_round_nearest_even );
roundIncrement = 0x40;
if ( ! roundNearestEven ) {
if ( roundingMode == float_round_to_zero ) {
roundIncrement = 0;
}
else {
roundIncrement = 0x7F;
if ( zSign ) {
if ( roundingMode == float_round_up ) roundIncrement = 0;
}
else {
if ( roundingMode == float_round_down ) roundIncrement = 0;
}
}
}
roundBits = zSig & 0x7F;
if ( 0xFD <= (uint16_t) zExp ) {
if ( ( 0xFD < zExp )
|| ( ( zExp == 0xFD )
&& ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
) {
float_raise( float_flag_overflow | float_flag_inexact );
return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
}
if ( zExp < 0 ) {
isTiny =
( float_detect_tininess == float_tininess_before_rounding )
|| ( zExp < -1 )
|| ( zSig + roundIncrement < 0x80000000 );
shift32RightJamming( zSig, - zExp, &zSig );
zExp = 0;
roundBits = zSig & 0x7F;
if ( isTiny && roundBits ) float_raise( float_flag_underflow );
}
}
if ( roundBits ) float_exception_flags |= float_flag_inexact;
zSig = ( zSig + roundIncrement )>>7;
zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
if ( zSig == 0 ) zExp = 0;
return packFloat32( zSign, zExp, zSig );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the canonical NaN `a' to the single-
| precision floating-point format.
*----------------------------------------------------------------------------*/
LOCALFUNC float32 commonNaNToFloat32( commonNaNT a )
{
return ( ( (uint32_t) a.sign )<<31 ) | 0x7FC00000 | ( a.high>>41 );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the single-precision floating-point format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float32 floatx80_to_float32( floatx80 a )
{
flag aSign;
int32_t aExp;
uint64_t aSig;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
aSign = extractFloatx80Sign( a );
if ( aExp == 0x7FFF ) {
if ( (uint64_t) ( aSig<<1 ) ) {
return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
}
return packFloat32( aSign, 0xFF, 0 );
}
shift64RightJamming( aSig, 33, &aSig );
if ( aExp || aSig ) aExp -= 0x3F81;
return roundAndPackFloat32( aSign, aExp, aSig );
}
typedef uint64_t float64;
/*----------------------------------------------------------------------------
| Returns the fraction bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC uint64_t extractFloat64Frac( float64 a )
{
return a & LIT64( 0x000FFFFFFFFFFFFF );
}
/*----------------------------------------------------------------------------
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC int16_t extractFloat64Exp( float64 a )
{
return ( a>>52 ) & 0x7FF;
}
/*----------------------------------------------------------------------------
| Returns the sign bit of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC flag extractFloat64Sign( float64 a )
{
return a>>63;
}
/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is a signaling
| NaN; otherwise returns 0.
*----------------------------------------------------------------------------*/
LOCALFUNC flag float64_is_signaling_nan( float64 a )
{
return
( ( ( a>>51 ) & 0xFFF ) == 0xFFE )
&& ( a & LIT64( 0x0007FFFFFFFFFFFF ) );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point NaN
| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
| exception is raised.
*----------------------------------------------------------------------------*/
LOCALFUNC commonNaNT float64ToCommonNaN( float64 a )
{
commonNaNT z;
if ( float64_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
z.sign = a>>63;
z.low = 0;
z.high = a<<12;
return z;
}
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/
LOCALPROC
normalizeFloat64Subnormal( uint64_t aSig, int16_t *zExpPtr, uint64_t *zSigPtr )
{
int8_t shiftCount;
shiftCount = countLeadingZeros64( aSig ) - 11;
*zSigPtr = aSig<<shiftCount;
*zExpPtr = 1 - shiftCount;
}
/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 float64_to_floatx80( float64 a )
{
flag aSign;
int16_t aExp;
uint64_t aSig;
aSig = extractFloat64Frac( a );
aExp = extractFloat64Exp( a );
aSign = extractFloat64Sign( a );
if ( aExp == 0x7FF ) {
if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
}
if ( aExp == 0 ) {
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
}
return
packFloatx80(
aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
}
/*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
| double-precision floating-point value, returning the result. After being
| shifted into the proper positions, the three fields are simply added
| together to form the result. This means that any integer portion of `zSig'
| will be added into the exponent. Since a properly normalized significand
| will have an integer portion equal to 1, the `zExp' input should be 1 less
| than the desired result exponent whenever `zSig' is a complete, normalized
| significand.
*----------------------------------------------------------------------------*/
LOCALINLINEFUNC float64 packFloat64( flag zSign, int16_t zExp, uint64_t zSig )
{
return ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig;
}
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded
| to a subnormal number, and the underflow and inexact exceptions are raised
| if the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
| The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location. This shifted
| significand must be normalized or smaller. If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float64 roundAndPackFloat64( flag zSign, int16_t zExp, uint64_t zSig )
{
int8_t roundingMode;
flag roundNearestEven;
int16_t roundIncrement, roundBits;
flag isTiny;
roundingMode = float_rounding_mode;
roundNearestEven = ( roundingMode == float_round_nearest_even );
roundIncrement = 0x200;
if ( ! roundNearestEven ) {
if ( roundingMode == float_round_to_zero ) {
roundIncrement = 0;
}
else {
roundIncrement = 0x3FF;
if ( zSign ) {
if ( roundingMode == float_round_up ) roundIncrement = 0;
}
else {
if ( roundingMode == float_round_down ) roundIncrement = 0;
}
}
}
roundBits = zSig & 0x3FF;
if ( 0x7FD <= (uint16_t) zExp ) {
if ( ( 0x7FD < zExp )
|| ( ( zExp == 0x7FD )
&& ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
) {
float_raise( float_flag_overflow | float_flag_inexact );
return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
}
if ( zExp < 0 ) {
isTiny =
( float_detect_tininess == float_tininess_before_rounding )
|| ( zExp < -1 )
|| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
shift64RightJamming( zSig, - zExp, &zSig );
zExp = 0;
roundBits = zSig & 0x3FF;
if ( isTiny && roundBits ) float_raise( float_flag_underflow );
}
}
if ( roundBits ) float_exception_flags |= float_flag_inexact;
zSig = ( zSig + roundIncrement )>>10;
zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
if ( zSig == 0 ) zExp = 0;
return packFloat64( zSign, zExp, zSig );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the canonical NaN `a' to the double-
| precision floating-point format.
*----------------------------------------------------------------------------*/
LOCALFUNC float64 commonNaNToFloat64( commonNaNT a )
{
return
( ( (uint64_t) a.sign )<<63 )
| LIT64( 0x7FF8000000000000 )
| ( a.high>>12 );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the double-precision floating-point format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC float64 floatx80_to_float64( floatx80 a )
{
flag aSign;
int32_t aExp;
uint64_t aSig, zSig;
aSig = extractFloatx80Frac( a );
aExp = extractFloatx80Exp( a );
aSign = extractFloatx80Sign( a );
if ( aExp == 0x7FFF ) {
if ( (uint64_t) ( aSig<<1 ) ) {
return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
}
return packFloat64( aSign, 0x7FF, 0 );
}
shift64RightJamming( aSig, 1, &zSig );
if ( aExp || aSig ) aExp -= 0x3C01;
return roundAndPackFloat64( aSign, aExp, zSig );
}
/* ----- end from original file "softfloat.c" ----- */
typedef int16_t Bit16s;
typedef int32_t Bit32s;
typedef int64_t Bit64s;
typedef uint32_t Bit32u;
typedef uint64_t Bit64u;
#define int16_indefinite 0x8000
/*----------------------------------------------------------------------------
| Takes extended double-precision floating-point NaN `a' and returns the
| appropriate NaN result. If `a' is a signaling NaN, the invalid exception
| is raised.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 propagateOneFloatx80NaN(floatx80 *a)
{
if (floatx80_is_signaling_nan(*a))
float_raise(float_flag_invalid);
a->low |= LIT64(0xC000000000000000);
return *a;
}
#define float_flag_denormal 0x02
#define floatx80_default_nan_exp 0xFFFF
#define floatx80_default_nan_fraction LIT64(0xC000000000000000)
#define packFloatx80m(zSign, zExp, zSig) {(zSig), ((zSign) << 15) + (zExp) }
/*----------------------------------------------------------------------------
| Packs two 64-bit precision integers into into the quadruple-precision
| floating-point value, returning the result.
*----------------------------------------------------------------------------*/
#define packFloat2x128m(zHi, zLo) {(zLo), (zHi)}
#define PACK_FLOAT_128(hi,lo) packFloat2x128m(LIT64(hi),LIT64(lo))
static const floatx80 floatx80_default_nan =
packFloatx80m(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
/*----------------------------------------------------------------------------
| Software IEC/IEEE floating-point ordering relations
*----------------------------------------------------------------------------*/
enum {
float_relation_less = -1,
float_relation_equal = 0,
float_relation_greater = 1,
float_relation_unordered = 2
};
#define EXP_BIAS 0x3FFF
/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point value `a' and quadruple-precision floating point value `b'. The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_mul128(floatx80 a, float128 b)
{
Bit32s aExp, bExp, zExp;
Bit64u aSig, bSig0, bSig1, zSig0, zSig1, zSig2;
int aSign, bSign, zSign;
// handle unsupported extended double-precision floating encodings
if (0 /* floatx80_is_unsupported(a) */)
{
invalid:
float_raise(float_flag_invalid);
return floatx80_default_nan;
}
aSig = extractFloatx80Frac(a);
aExp = extractFloatx80Exp(a);
aSign = extractFloatx80Sign(a);
bSig0 = extractFloat128Frac0(b);
bSig1 = extractFloat128Frac1(b);
bExp = extractFloat128Exp(b);
bSign = extractFloat128Sign(b);
zSign = aSign ^ bSign;
if (aExp == 0x7FFF) {
if ((Bit64u) (aSig<<1)
|| ((bExp == 0x7FFF) && (bSig0 | bSig1)))
{
floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b));
return propagateFloatx80NaN(a, r);
}
if (bExp == 0) {
if ((bSig0 | bSig1) == 0) goto invalid;
float_raise(float_flag_denormal);
}
return packFloatx80(zSign, 0x7FFF, LIT64(0x8000000000000000));
}
if (bExp == 0x7FFF) {
if (bSig0 | bSig1) {
floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b));
return propagateFloatx80NaN(a, r);
}
if (aExp == 0) {
if (aSig == 0) goto invalid;
float_raise(float_flag_denormal);
}
return packFloatx80(zSign, 0x7FFF, LIT64(0x8000000000000000));
}
if (aExp == 0) {
if (aSig == 0) {
if ((bExp == 0) && (bSig0 | bSig1)) float_raise(float_flag_denormal);
return packFloatx80(zSign, 0, 0);
}
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
}
if (bExp == 0) {
if ((bSig0 | bSig1) == 0) return packFloatx80(zSign, 0, 0);
float_raise(float_flag_denormal);
normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
}
else bSig0 |= LIT64(0x0001000000000000);
zExp = aExp + bExp - 0x3FFE;
shortShift128Left(bSig0, bSig1, 15, &bSig0, &bSig1);
mul128By64To192(bSig0, bSig1, aSig, &zSig0, &zSig1, &zSig2);
if (0 < (Bit64s) zSig0) {
shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
--zExp;
}
return
roundAndPackFloatx80(floatx80_rounding_precision,
zSign, zExp, zSig0, zSig1);
}
/* ----- from original file "softfloatx80.h" ----- */
/*
["original Stanislav Shwartsman Copyright notice" went here, included near top of this file.]
*/
/*----------------------------------------------------------------------------
| Software IEC/IEEE floating-point class.
*----------------------------------------------------------------------------*/
typedef enum {
float_zero,
float_NaN,
float_negative_inf,
float_positive_inf,
float_denormal,
float_normalized
} float_class_t;
/*-----------------------------------------------------------------------------
| Calculates the absolute value of the extended double-precision floating-point
| value `a'. The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
#if 0
inline floatx80 floatx80_abs(floatx80 *reg)
{
reg->high &= 0x7FFF;
return *reg;
}
#endif
/*-----------------------------------------------------------------------------
| Changes the sign of the extended double-precision floating-point value 'a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_chs(floatx80 *x)
{
x->high ^= 0x8000;
return *x;
}
/* ----- end from original file "softfloatx80.h" ----- */
/* ----- from original file "softfloatx80.cc" ----- */
/*
["original Stanislav Shwartsman Copyright notice" went here, included near top of this file.]
*/
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 16-bit two's complement integer format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic - which means in particular that the conversion
| is rounded according to the current rounding mode. If `a' is a NaN or the
| conversion overflows, the integer indefinite value is returned.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC Bit16s floatx80_to_int16(floatx80 a)
{
#if 0
if (floatx80_is_unsupported(a))
{
float_raise(float_flag_invalid);
return int16_indefinite;
}
#endif
Bit32s v32 = floatx80_to_int32(a);
if ((v32 > 32767) || (v32 < -32768)) {
float_exception_flags = float_flag_invalid; // throw way other flags
return int16_indefinite;
}
return (Bit16s) v32;
}
#endif
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 16-bit two's complement integer format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero. If `a' is a NaN or the conversion overflows, the integer
| indefinite value is returned.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC Bit16s floatx80_to_int16_round_to_zero(floatx80 a)
{
#if 0
if (floatx80_is_unsupported(a))
{
float_raise(float_flag_invalid);
return int16_indefinite;
}
#endif
Bit32s v32 = floatx80_to_int32_round_to_zero(a);
if ((v32 > 32767) || (v32 < -32768)) {
float_exception_flags = float_flag_invalid; // throw way other flags
return int16_indefinite;
}
return (Bit16s) v32;
}
#endif
/*----------------------------------------------------------------------------
| Separate the source extended double-precision floating point value `a'
| into its exponent and significand, store the significant back to the
| 'a' and return the exponent. The operation performed is a superset of
| the IEC/IEEE recommended logb(x) function.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_extract(floatx80 *a)
{
Bit64u aSig = extractFloatx80Frac(*a);
Bit32s aExp = extractFloatx80Exp(*a);
int aSign = extractFloatx80Sign(*a);
#if 0
if (floatx80_is_unsupported(*a))
{
float_raise(float_flag_invalid);
*a = floatx80_default_nan;
return *a;
}
#endif
if (aExp == 0x7FFF) {
if ((Bit64u) (aSig<<1))
{
*a = propagateOneFloatx80NaN(a);
return *a;
}
return packFloatx80(0, 0x7FFF, LIT64(0x8000000000000000));
}
if (aExp == 0)
{
if (aSig == 0) {
float_raise(float_flag_divbyzero);
*a = packFloatx80(aSign, 0, 0);
return packFloatx80(1, 0x7FFF, LIT64(0x8000000000000000));
}
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
}
a->high = (aSign << 15) + 0x3FFF;
a->low = aSig;
return int32_to_floatx80(aExp - 0x3FFF);
}
/*----------------------------------------------------------------------------
| Scales extended double-precision floating-point value in operand `a' by
| value `b'. The function truncates the value in the second operand 'b' to
| an integral value and adds that value to the exponent of the operand 'a'.
| The operation performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_scale(floatx80 a, floatx80 b)
{
int shiftCount;
Bit32s scale;
// handle unsupported extended double-precision floating encodings
#if 0
if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
{
float_raise(float_flag_invalid);
return floatx80_default_nan;
}
#endif
Bit64u aSig = extractFloatx80Frac(a);
Bit32s aExp = extractFloatx80Exp(a);
int aSign = extractFloatx80Sign(a);
Bit64u bSig = extractFloatx80Frac(b);
Bit32s bExp = extractFloatx80Exp(b);
int bSign = extractFloatx80Sign(b);
if (aExp == 0x7FFF) {
if ((Bit64u) (aSig<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
{
return propagateFloatx80NaN(a, b);
}
if ((bExp == 0x7FFF) && bSign) {
float_raise(float_flag_invalid);
return floatx80_default_nan;
}
if (bSig && (bExp == 0)) float_raise(float_flag_denormal);
return a;
}
if (bExp == 0x7FFF) {
if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b);
if ((aExp | aSig) == 0) {
if (! bSign) {
float_raise(float_flag_invalid);
return floatx80_default_nan;
}
return a;
}
if (aSig && (aExp == 0)) float_raise(float_flag_denormal);
if (bSign) return packFloatx80(aSign, 0, 0);
return packFloatx80(aSign, 0x7FFF, LIT64(0x8000000000000000));
}
if (aExp == 0) {
if (bSig && (bExp == 0)) float_raise(float_flag_denormal);
if (aSig == 0) return a;
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
if (bExp < 0x3FFF)
return normalizeRoundAndPackFloatx80(80, aSign, aExp, aSig, 0);
}
if (bExp == 0) {
if (bSig == 0) return a;
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
}
if (bExp > 0x400E) {
/* generate appropriate overflow/underflow */
return roundAndPackFloatx80(80, aSign,
bSign ? -0x3FFF : 0x7FFF, aSig, 0);
}
if (bExp < 0x3FFF) return a;
shiftCount = 0x403E - bExp;
bSig >>= shiftCount;
scale = (Bit32s) bSig;
if (bSign) scale = -scale; /* -32768..32767 */
return
roundAndPackFloatx80(80, aSign, aExp+scale, aSig, 0);
}
/*----------------------------------------------------------------------------
| Determine extended-precision floating-point number class.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC float_class_t floatx80_class(floatx80 a)
{
Bit32s aExp = extractFloatx80Exp(a);
Bit64u aSig = extractFloatx80Frac(a);
if(aExp == 0) {
if (aSig == 0)
return float_zero;
/* denormal or pseudo-denormal */
return float_denormal;
}
/* valid numbers have the MS bit set */
if (!(aSig & LIT64(0x8000000000000000)))
return float_NaN; /* report unsupported as NaNs */
if(aExp == 0x7fff) {
int aSign = extractFloatx80Sign(a);
if (((Bit64u) (aSig<< 1)) == 0)
return (aSign) ? float_negative_inf : float_positive_inf;
return float_NaN;
}
return float_normalized;
}
#endif
/*----------------------------------------------------------------------------
| Compare between two extended precision floating point numbers. Returns
| 'float_relation_equal' if the operands are equal, 'float_relation_less' if
| the value 'a' is less than the corresponding value `b',
| 'float_relation_greater' if the value 'a' is greater than the corresponding
| value `b', or 'float_relation_unordered' otherwise.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC int floatx80_compare(floatx80 a, floatx80 b)
{
int aSign;
int bSign;
Bit64u aSig;
Bit32s aExp;
Bit64u bSig;
Bit32s bExp;
int less_than;
float_class_t aClass = floatx80_class(a);
float_class_t bClass = floatx80_class(b);
if (aClass == float_NaN || bClass == float_NaN)
{
float_raise(float_flag_invalid);
return float_relation_unordered;
}
if (aClass == float_denormal || bClass == float_denormal)
{
float_raise(float_flag_denormal);
}
aSign = extractFloatx80Sign(a);
bSign = extractFloatx80Sign(b);
if (aClass == float_zero) {
if (bClass == float_zero) return float_relation_equal;
return bSign ? float_relation_greater : float_relation_less;
}
if (bClass == float_zero || aSign != bSign) {
return aSign ? float_relation_less : float_relation_greater;
}
aSig = extractFloatx80Frac(a);
aExp = extractFloatx80Exp(a);
bSig = extractFloatx80Frac(b);
bExp = extractFloatx80Exp(b);
if (aClass == float_denormal)
normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
if (bClass == float_denormal)
normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
if (aExp == bExp && aSig == bSig)
return float_relation_equal;
less_than =
aSign ? ((bExp < aExp) || ((bExp == aExp) && (bSig < aSig)))
: ((aExp < bExp) || ((aExp == bExp) && (aSig < bSig)));
if (less_than) return float_relation_less;
return float_relation_greater;
}
#endif
/*----------------------------------------------------------------------------
| Compare between two extended precision floating point numbers. Returns
| 'float_relation_equal' if the operands are equal, 'float_relation_less' if
| the value 'a' is less than the corresponding value `b',
| 'float_relation_greater' if the value 'a' is greater than the corresponding
| value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause
| an exception.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC int floatx80_compare_quiet(floatx80 a, floatx80 b)
{
int aSign;
int bSign;
Bit64u aSig;
Bit32s aExp;
Bit64u bSig;
Bit32s bExp;
int less_than;
float_class_t aClass = floatx80_class(a);
float_class_t bClass = floatx80_class(b);
if (aClass == float_NaN || bClass == float_NaN)
{
#if 0
if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
float_raise(float_flag_invalid);
#endif
if (floatx80_is_signaling_nan(a) || floatx80_is_signaling_nan(b))
float_raise(float_flag_invalid);
return float_relation_unordered;
}
if (aClass == float_denormal || bClass == float_denormal)
{
float_raise(float_flag_denormal);
}
aSign = extractFloatx80Sign(a);
bSign = extractFloatx80Sign(b);
if (aClass == float_zero) {
if (bClass == float_zero) return float_relation_equal;
return bSign ? float_relation_greater : float_relation_less;
}
if (bClass == float_zero || aSign != bSign) {
return aSign ? float_relation_less : float_relation_greater;
}
aSig = extractFloatx80Frac(a);
aExp = extractFloatx80Exp(a);
bSig = extractFloatx80Frac(b);
bExp = extractFloatx80Exp(b);
if (aClass == float_denormal)
normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
if (bClass == float_denormal)
normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
if (aExp == bExp && aSig == bSig)
return float_relation_equal;
less_than =
aSign ? ((bExp < aExp) || ((bExp == aExp) && (bSig < aSig)))
: ((aExp < bExp) || ((aExp == bExp) && (aSig < bSig)));
if (less_than) return float_relation_less;
return float_relation_greater;
}
#endif
/* ----- end from original file "softfloatx80.cc" ----- */
/* ----- from original file "fprem.cc" ----- */
/*
["original Stanislav Shwartsman Copyright notice" went here, included near top of this file.]
*/
#define USE_estimateDiv128To64
/* executes single exponent reduction cycle */
LOCALFUNC Bit64u remainder_kernel(Bit64u aSig0, Bit64u bSig, int expDiff, Bit64u *zSig0, Bit64u *zSig1)
{
Bit64u term0, term1;
Bit64u aSig1 = 0;
Bit64u q;
shortShift128Left(aSig1, aSig0, expDiff, &aSig1, &aSig0);
q = estimateDiv128To64(aSig1, aSig0, bSig);
mul64To128(bSig, q, &term0, &term1);
sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
while ((Bit64s)(*zSig1) < 0) {
--q;
add128(*zSig1, *zSig0, 0, bSig, zSig1, zSig0);
}
return q;
}
LOCALFUNC floatx80 do_fprem(floatx80 a, floatx80 b, Bit64u *q, int rounding_mode)
{
Bit32s aExp, bExp, zExp, expDiff;
Bit64u aSig0, aSig1, bSig;
int aSign;
*q = 0;
#if 0
// handle unsupported extended double-precision floating encodings
if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
{
float_raise(float_flag_invalid);
return floatx80_default_nan;
}
#endif
aSig0 = extractFloatx80Frac(a);
aExp = extractFloatx80Exp(a);
aSign = extractFloatx80Sign(a);
bSig = extractFloatx80Frac(b);
bExp = extractFloatx80Exp(b);
if (aExp == 0x7FFF) {
if ((Bit64u) (aSig0<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1))) {
return propagateFloatx80NaN(a, b);
}
float_raise(float_flag_invalid);
return floatx80_default_nan;
}
if (bExp == 0x7FFF) {
if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b);
if (aExp == 0 && aSig0) {
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
return (a.low & LIT64(0x8000000000000000)) ?
packFloatx80(aSign, aExp, aSig0) : a;
}
return a;
}
if (bExp == 0) {
if (bSig == 0) {
float_raise(float_flag_invalid);
return floatx80_default_nan;
}
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
}
if (aExp == 0) {
if (aSig0 == 0) return a;
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
}
expDiff = aExp - bExp;
aSig1 = 0;
if (expDiff >= 64) {
int n = (expDiff & 0x1f) | 0x20;
remainder_kernel(aSig0, bSig, n, &aSig0, &aSig1);
zExp = aExp - n;
*q = (Bit64u) -1;
}
else {
zExp = bExp;
if (expDiff < 0) {
if (expDiff < -1)
return (a.low & LIT64(0x8000000000000000)) ?
packFloatx80(aSign, aExp, aSig0) : a;
shift128Right(aSig0, 0, 1, &aSig0, &aSig1);
expDiff = 0;
}
if (expDiff > 0) {
*q = remainder_kernel(aSig0, bSig, expDiff, &aSig0, &aSig1);
}
else {
if (bSig <= aSig0) {
aSig0 -= bSig;
*q = 1;
}
}
if (rounding_mode == float_round_nearest_even)
{
Bit64u term0, term1;
shift128Right(bSig, 0, 1, &term0, &term1);
if (! lt128(aSig0, aSig1, term0, term1))
{
int lt = lt128(term0, term1, aSig0, aSig1);
int eq = eq128(aSig0, aSig1, term0, term1);
if ((eq && (*q & 1)) || lt) {
aSign = !aSign;
++*q;
}
if (lt) sub128(bSig, 0, aSig0, aSig1, &aSig0, &aSig1);
}
}
}
return normalizeRoundAndPackFloatx80(80, aSign, zExp, aSig0, aSig1);
}
/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
#if cIncludeFPUUnused
LOCALFUNC floatx80 floatx80_ieee754_remainder(floatx80 a, floatx80 b, Bit64u *q)
{
return do_fprem(a, b, q, float_round_nearest_even);
}
#endif
/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'. Unlike previous function
| the function does not compute the remainder specified in the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic. This function operates
| differently from the previous function in the way that it rounds the
| quotient of 'a' divided by 'b' to an integer.
*----------------------------------------------------------------------------*/
LOCALFUNC floatx80 floatx80_remainder(floatx80 a, floatx80 b, Bit64u *q)
{
return do_fprem(a, b, q, float_round_to_zero);
}
/* ----- end from original file "fprem.cc" ----- */
/* ----- from original file "fpu_constant.h" ----- */
/*
["original Stanislav Shwartsman Copyright notice" went here, included near top of this file.]
*/
// Pentium CPU uses only 68-bit precision M_PI approximation
// #define BETTER_THAN_PENTIUM
//////////////////////////////
// PI, PI/2, PI/4 constants
//////////////////////////////
#define FLOATX80_PI_EXP (0x4000)
// 128-bit PI fraction
#ifdef BETTER_THAN_PENTIUM
#define FLOAT_PI_HI (LIT64(0xc90fdaa22168c234))
#define FLOAT_PI_LO (LIT64(0xc4c6628b80dc1cd1))
#else
#define FLOAT_PI_HI (LIT64(0xc90fdaa22168c234))
#define FLOAT_PI_LO (LIT64(0xC000000000000000))
#endif
#define FLOATX80_PI2_EXP (0x3FFF)
#define FLOATX80_PI4_EXP (0x3FFE)
//////////////////////////////
// 3PI/4 constant
//////////////////////////////
#define FLOATX80_3PI4_EXP (0x4000)
// 128-bit 3PI/4 fraction
#ifdef BETTER_THAN_PENTIUM
#define FLOAT_3PI4_HI (LIT64(0x96cbe3f9990e91a7))
#define FLOAT_3PI4_LO (LIT64(0x9394c9e8a0a5159c))
#else
#define FLOAT_3PI4_HI (LIT64(0x96cbe3f9990e91a7))
#define FLOAT_3PI4_LO (LIT64(0x9000000000000000))
#endif
//////////////////////////////
// 1/LN2 constant
//////////////////////////////
#define FLOAT_LN2INV_EXP (0x3FFF)
// 128-bit 1/LN2 fraction
#ifdef BETTER_THAN_PENTIUM
#define FLOAT_LN2INV_HI (LIT64(0xb8aa3b295c17f0bb))
#define FLOAT_LN2INV_LO (LIT64(0xbe87fed0691d3e89))
#else
#define FLOAT_LN2INV_HI (LIT64(0xb8aa3b295c17f0bb))
#define FLOAT_LN2INV_LO (LIT64(0xC000000000000000))
#endif
/* ----- end from original file "fpu_constant.h" ----- */
/* ----- from original file "poly.cc" ----- */
/*
["original Stanislav Shwartsman Copyright notice" went here, included near top of this file.]
*/
// 2 3 4 n
// f(x) ~ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x)
// 0 1 2 3 4 n
//
// -- 2k -- 2k+1
// p(x) = > C * x q(x) = > C * x
// -- 2k -- 2k+1
//
// f(x) ~ [ p(x) + x * q(x) ]
//
LOCALFUNC float128 EvalPoly(float128 x, float128 *arr, unsigned n)
{
float128 r2;
float128 x2 = float128_mul(x, x);
unsigned i;
#if 0
assert(n > 1);
#endif
float128 r1 = arr[--n];
i = n;
while(i >= 2) {
r1 = float128_mul(r1, x2);
i -= 2;
r1 = float128_add(r1, arr[i]);
}
if (i) r1 = float128_mul(r1, x);
r2 = arr[--n];
i = n;
while(i >= 2) {
r2 = float128_mul(r2, x2);
i -= 2;
r2 = float128_add(r2, arr[i]);
}
if (i) r2 = float128_mul(r2, x);
return float128_add(r1, r2);
}
// 2 4 6 8 2n
// f(x) ~ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x)
// 0 1 2 3 4 n
//
// -- 4k -- 4k+2
// p(x) = > C * x q(x) = > C * x
// -- 2k -- 2k+1
//
// 2
// f(x) ~ [ p(x) + x * q(x) ]
//
LOCALFUNC float128 EvenPoly(float128 x, float128 *arr, unsigned n)
{
return EvalPoly(float128_mul(x, x), arr, n);
}
// 3 5 7 9 2n+1
// f(x) ~ (C * x) + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x)
// 0 1 2 3 4 n
// 2 4 6 8 2n
// = x * [ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x)
// 0 1 2 3 4 n
//
// -- 4k -- 4k+2
// p(x) = > C * x q(x) = > C * x
// -- 2k -- 2k+1
//
// 2
// f(x) ~ x * [ p(x) + x * q(x) ]
//
LOCALFUNC float128 OddPoly(float128 x, float128 *arr, unsigned n)
{
return float128_mul(x, EvenPoly(x, arr, n));
}
/* ----- end from original file "poly.cc" ----- */
/* ----- from original file "fyl2x.cc" ----- */
/*
["original Stanislav Shwartsman Copyright notice" went here, included near top of this file.]
*/
static const floatx80 floatx80_one =
packFloatx80m(0, 0x3fff, LIT64(0x8000000000000000));
static const float128 float128_one =
packFloat2x128m(LIT64(0x3fff000000000000), LIT64(0x0000000000000000));
static const float128 float128_two =
packFloat2x128m(LIT64(0x4000000000000000), LIT64(0x0000000000000000));
static const float128 float128_ln2inv2 =
packFloat2x128m(LIT64(0x400071547652b82f), LIT64(0xe1777d0ffda0d23a));
#define SQRT2_HALF_SIG LIT64(0xb504f333f9de6484)
#define L2_ARR_SIZE 9
static float128 ln_arr[L2_ARR_SIZE] =
{
PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /* 1 */
PACK_FLOAT_128(0x3ffd555555555555, 0x5555555555555555), /* 3 */
PACK_FLOAT_128(0x3ffc999999999999, 0x999999999999999a), /* 5 */
PACK_FLOAT_128(0x3ffc249249249249, 0x2492492492492492), /* 7 */
PACK_FLOAT_128(0x3ffbc71c71c71c71, 0xc71c71c71c71c71c), /* 9 */
PACK_FLOAT_128(0x3ffb745d1745d174, 0x5d1745d1745d1746), /* 11 */
PACK_FLOAT_128(0x3ffb3b13b13b13b1, 0x3b13b13b13b13b14), /* 13 */
PACK_FLOAT_128(0x3ffb111111111111, 0x1111111111111111), /* 15 */
PACK_FLOAT_128(0x3ffae1e1e1e1e1e1, 0xe1e1e1e1e1e1e1e2) /* 17 */
};
LOCALFUNC float128 poly_ln(float128 x1)
{
/*
//
// 3 5 7 9 11 13 15
// 1+u u u u u u u u
// 1/2 ln --- ~ u + --- + --- + --- + --- + ---- + ---- + ---- =
// 1-u 3 5 7 9 11 13 15
//
// 2 4 6 8 10 12 14
// u u u u u u u
// = u * [ 1 + --- + --- + --- + --- + ---- + ---- + ---- ] =
// 3 5 7 9 11 13 15
//
// 3 3
// -- 4k -- 4k+2
// p(u) = > C * u q(u) = > C * u
// -- 2k -- 2k+1
// k=0 k=0
//
// 1+u 2
// 1/2 ln --- ~ u * [ p(u) + u * q(u) ]
// 1-u
//
*/
return OddPoly(x1, ln_arr, L2_ARR_SIZE);
}
/* required sqrt(2)/2 < x < sqrt(2) */
LOCALFUNC float128 poly_l2(float128 x)
{
/* using float128 for approximation */
float128 x_p1 = float128_add(x, float128_one);
float128 x_m1 = float128_sub(x, float128_one);
x = float128_div(x_m1, x_p1);
x = poly_ln(x);
x = float128_mul(x, float128_ln2inv2);
return x;
}
LOCALFUNC float128 poly_l2p1(float128 x)
{
/* using float128 for approximation */
float128 x_p2 = float128_add(x, float128_two);
x = float128_div(x, x_p2);
x = poly_ln(x);
x = float128_mul(x, float128_ln2inv2);
return x;
}
// =================================================
// FYL2X Compute y * log (x)
// 2
// =================================================
//
// Uses the following identities:
//
// 1. ----------------------------------------------------------
// ln(x)
// log (x) = -------, ln (x*y) = ln(x) + ln(y)
// 2 ln(2)
//
// 2. ----------------------------------------------------------
// 1+u x-1
// ln (x) = ln -----, when u = -----
// 1-u x+1
//
// 3. ----------------------------------------------------------
// 3 5 7 2n+1
// 1+u u u u u
// ln ----- = 2 [ u + --- + --- + --- + ... + ------ + ... ]
// 1-u 3 5 7 2n+1
//
LOCALFUNC floatx80 fyl2x(floatx80 a, floatx80 b)
{
int aSign;
int bSign;
Bit64u aSig;
Bit32s aExp;
Bit64u bSig;
Bit32s bExp;
int zSign;
int ExpDiff;
Bit64u zSig0, zSig1;
float128 x;
// handle unsupported extended double-precision floating encodings
if (0 /* floatx80_is_unsupported(a) */) {
invalid:
float_raise(float_flag_invalid);
return floatx80_default_nan;
}
aSig = extractFloatx80Frac(a);
aExp = extractFloatx80Exp(a);
aSign = extractFloatx80Sign(a);
bSig = extractFloatx80Frac(b);
bExp = extractFloatx80Exp(b);
bSign = extractFloatx80Sign(b);
zSign = bSign ^ 1;
if (aExp == 0x7FFF) {
if ((Bit64u) (aSig<<1)
|| ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
{
return propagateFloatx80NaN(a, b);
}
if (aSign) goto invalid;
else {
if (bExp == 0) {
if (bSig == 0) goto invalid;
float_raise(float_flag_denormal);
}
return packFloatx80(bSign, 0x7FFF, LIT64(0x8000000000000000));
}
}
if (bExp == 0x7FFF)
{
if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b);
if (aSign && (Bit64u)(aExp | aSig)) goto invalid;
if (aSig && (aExp == 0))
float_raise(float_flag_denormal);
if (aExp < 0x3FFF) {
return packFloatx80(zSign, 0x7FFF, LIT64(0x8000000000000000));
}
if (aExp == 0x3FFF && ((Bit64u) (aSig<<1) == 0)) goto invalid;
return packFloatx80(bSign, 0x7FFF, LIT64(0x8000000000000000));
}
if (aExp == 0) {
if (aSig == 0) {
if ((bExp | bSig) == 0) goto invalid;
float_raise(float_flag_divbyzero);
return packFloatx80(zSign, 0x7FFF, LIT64(0x8000000000000000));
}
if (aSign) goto invalid;
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
}
if (aSign) goto invalid;
if (bExp == 0) {
if (bSig == 0) {
if (aExp < 0x3FFF) return packFloatx80(zSign, 0, 0);
return packFloatx80(bSign, 0, 0);
}
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
}
if (aExp == 0x3FFF && ((Bit64u) (aSig<<1) == 0))
return packFloatx80(bSign, 0, 0);
float_raise(float_flag_inexact);
ExpDiff = aExp - 0x3FFF;
aExp = 0;
if (aSig >= SQRT2_HALF_SIG) {
ExpDiff++;
aExp--;
}
/* ******************************** */
/* using float128 for approximation */
/* ******************************** */
shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
x = packFloat128(0, aExp+0x3FFF, zSig0, zSig1);
x = poly_l2(x);
x = float128_add(x, floatx80_to_float128(int32_to_floatx80(ExpDiff)));
return floatx80_mul128(b, x);
}
// =================================================
// FYL2XP1 Compute y * log (x + 1)
// 2
// =================================================
//
// Uses the following identities:
//
// 1. ----------------------------------------------------------
// ln(x)
// log (x) = -------
// 2 ln(2)
//
// 2. ----------------------------------------------------------
// 1+u x
// ln (x+1) = ln -----, when u = -----
// 1-u x+2
//
// 3. ----------------------------------------------------------
// 3 5 7 2n+1
// 1+u u u u u
// ln ----- = 2 [ u + --- + --- + --- + ... + ------ + ... ]
// 1-u 3 5 7 2n+1
//
LOCALFUNC floatx80 fyl2xp1(floatx80 a, floatx80 b)
{
Bit32s aExp, bExp;
Bit64u aSig, bSig, zSig0, zSig1, zSig2;
int aSign, bSign;
int zSign;
float128 x;
// handle unsupported extended double-precision floating encodings
if (0 /* floatx80_is_unsupported(a) */) {
invalid:
float_raise(float_flag_invalid);
return floatx80_default_nan;
}
aSig = extractFloatx80Frac(a);
aExp = extractFloatx80Exp(a);
aSign = extractFloatx80Sign(a);
bSig = extractFloatx80Frac(b);
bExp = extractFloatx80Exp(b);
bSign = extractFloatx80Sign(b);
zSign = aSign ^ bSign;
if (aExp == 0x7FFF) {
if ((Bit64u) (aSig<<1)
|| ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
{
return propagateFloatx80NaN(a, b);
}
if (aSign) goto invalid;
else {
if (bExp == 0) {
if (bSig == 0) goto invalid;
float_raise(float_flag_denormal);
}
return packFloatx80(bSign, 0x7FFF, LIT64(0x8000000000000000));
}
}
if (bExp == 0x7FFF)
{
if ((Bit64u) (bSig<<1))
return propagateFloatx80NaN(a, b);
if (aExp == 0) {
if (aSig == 0) goto invalid;
float_raise(float_flag_denormal);
}
return packFloatx80(zSign, 0x7FFF, LIT64(0x8000000000000000));
}
if (aExp == 0) {
if (aSig == 0) {
if (bSig && (bExp == 0)) float_raise(float_flag_denormal);
return packFloatx80(zSign, 0, 0);
}
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
}
if (bExp == 0) {
if (bSig == 0) return packFloatx80(zSign, 0, 0);
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
}
float_raise(float_flag_inexact);
if (aSign && aExp >= 0x3FFF)
return a;
if (aExp >= 0x3FFC) // big argument
{
return fyl2x(floatx80_add(a, floatx80_one), b);
}
// handle tiny argument
if (aExp < EXP_BIAS-70)
{
// first order approximation, return (a*b)/ln(2)
Bit32s zExp = aExp + FLOAT_LN2INV_EXP - 0x3FFE;
mul128By64To192(FLOAT_LN2INV_HI, FLOAT_LN2INV_LO, aSig, &zSig0, &zSig1, &zSig2);
if (0 < (Bit64s) zSig0) {
shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
--zExp;
}
zExp = zExp + bExp - 0x3FFE;
mul128By64To192(zSig0, zSig1, bSig, &zSig0, &zSig1, &zSig2);
if (0 < (Bit64s) zSig0) {
shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
--zExp;
}
return
roundAndPackFloatx80(80, aSign ^ bSign, zExp, zSig0, zSig1);
}
/* ******************************** */
/* using float128 for approximation */
/* ******************************** */
shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
x = packFloat128(aSign, aExp, zSig0, zSig1);
x = poly_l2p1(x);
return floatx80_mul128(b, x);
}
/* ----- end from original file "fyl2x.cc" ----- */
/* ----- from original file "f2xm1.cc" ----- */
/*
["original Stanislav Shwartsman Copyright notice" went here, included near top of this file.]
*/
static const floatx80 floatx80_negone = packFloatx80m(1, 0x3fff, LIT64(0x8000000000000000));
static const floatx80 floatx80_neghalf = packFloatx80m(1, 0x3ffe, LIT64(0x8000000000000000));
static const float128 float128_ln2 =
packFloat2x128m(LIT64(0x3ffe62e42fefa39e), LIT64(0xf35793c7673007e6));
#define LN2_SIG LIT64(0xb17217f7d1cf79ac)
#define EXP_ARR_SIZE 15
static float128 exp_arr[EXP_ARR_SIZE] =
{
PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /* 1 */
PACK_FLOAT_128(0x3ffe000000000000, 0x0000000000000000), /* 2 */
PACK_FLOAT_128(0x3ffc555555555555, 0x5555555555555555), /* 3 */
PACK_FLOAT_128(0x3ffa555555555555, 0x5555555555555555), /* 4 */
PACK_FLOAT_128(0x3ff8111111111111, 0x1111111111111111), /* 5 */
PACK_FLOAT_128(0x3ff56c16c16c16c1, 0x6c16c16c16c16c17), /* 6 */
PACK_FLOAT_128(0x3ff2a01a01a01a01, 0xa01a01a01a01a01a), /* 7 */
PACK_FLOAT_128(0x3fefa01a01a01a01, 0xa01a01a01a01a01a), /* 8 */
PACK_FLOAT_128(0x3fec71de3a556c73, 0x38faac1c88e50017), /* 9 */
PACK_FLOAT_128(0x3fe927e4fb7789f5, 0xc72ef016d3ea6679), /* 10 */
PACK_FLOAT_128(0x3fe5ae64567f544e, 0x38fe747e4b837dc7), /* 11 */
PACK_FLOAT_128(0x3fe21eed8eff8d89, 0x7b544da987acfe85), /* 12 */
PACK_FLOAT_128(0x3fde6124613a86d0, 0x97ca38331d23af68), /* 13 */
PACK_FLOAT_128(0x3fda93974a8c07c9, 0xd20badf145dfa3e5), /* 14 */
PACK_FLOAT_128(0x3fd6ae7f3e733b81, 0xf11d8656b0ee8cb0) /* 15 */
};
/* required -1 < x < 1 */
LOCALFUNC float128 poly_exp(float128 x)
{
/*
// 2 3 4 5 6 7 8 9
// x x x x x x x x x
// e - 1 ~ x + --- + --- + --- + --- + --- + --- + --- + --- + ...
// 2! 3! 4! 5! 6! 7! 8! 9!
//
// 2 3 4 5 6 7 8
// x x x x x x x x
// = x [ 1 + --- + --- + --- + --- + --- + --- + --- + --- + ... ]
// 2! 3! 4! 5! 6! 7! 8! 9!
//
// 8 8
// -- 2k -- 2k+1
// p(x) = > C * x q(x) = > C * x
// -- 2k -- 2k+1
// k=0 k=0
//
// x
// e - 1 ~ x * [ p(x) + x * q(x) ]
//
*/
float128 t = EvalPoly(x, exp_arr, EXP_ARR_SIZE);
return float128_mul(t, x);
}
// =================================================
// x
// FX2P1 Compute 2 - 1
// =================================================
//
// Uses the following identities:
//
// 1. ----------------------------------------------------------
// x x*ln(2)
// 2 = e
//
// 2. ----------------------------------------------------------
// 2 3 4 5 n
// x x x x x x x
// e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
// 1! 2! 3! 4! 5! n!
//
LOCALFUNC floatx80 f2xm1(floatx80 a)
{
Bit64u zSig0, zSig1;
float128 x;
#if 0
// handle unsupported extended double-precision floating encodings
if (floatx80_is_unsupported(a))
{
float_raise(float_flag_invalid);
return floatx80_default_nan;
}
#endif
Bit64u aSig = extractFloatx80Frac(a);
Bit32s aExp = extractFloatx80Exp(a);
int aSign = extractFloatx80Sign(a);
if (aExp == 0x7FFF) {
if ((Bit64u) (aSig<<1))
return propagateOneFloatx80NaN(&a);
return (aSign) ? floatx80_negone : a;
}
if (aExp == 0) {
if (aSig == 0) return a;
float_raise(float_flag_denormal | float_flag_inexact);
normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
tiny_argument:
mul64To128(aSig, LN2_SIG, &zSig0, &zSig1);
if (0 < (Bit64s) zSig0) {
shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
--aExp;
}
return
roundAndPackFloatx80(80, aSign, aExp, zSig0, zSig1);
}
float_raise(float_flag_inexact);
if (aExp < 0x3FFF)
{
if (aExp < EXP_BIAS-68)
goto tiny_argument;
/* ******************************** */
/* using float128 for approximation */
/* ******************************** */
x = floatx80_to_float128(a);
x = float128_mul(x, float128_ln2);
x = poly_exp(x);
return float128_to_floatx80(x);
}
else
{
if ((a.high == 0xBFFF) && (! (aSig<<1)))
return floatx80_neghalf;
return a;
}
}
/* ----- end from original file "f2xm1.cc" ----- */
/* ----- from original file "fsincos.cc" ----- */
/*
["original Stanislav Shwartsman Copyright notice" went here, included near top of this file.]
*/
#define USE_estimateDiv128To64
#if 0
static const floatx80 floatx80_one = packFloatx80m(0, 0x3fff, LIT64(0x8000000000000000));
#endif
/* reduce trigonometric function argument using 128-bit precision
M_PI approximation */
LOCALFUNC Bit64u argument_reduction_kernel(Bit64u aSig0, int Exp, Bit64u *zSig0, Bit64u *zSig1)
{
Bit64u term0, term1, term2;
Bit64u aSig1 = 0;
Bit64u q;
shortShift128Left(aSig1, aSig0, Exp, &aSig1, &aSig0);
q = estimateDiv128To64(aSig1, aSig0, FLOAT_PI_HI);
mul128By64To192(FLOAT_PI_HI, FLOAT_PI_LO, q, &term0, &term1, &term2);
sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
while ((Bit64s)(*zSig1) < 0) {
--q;
add192(*zSig1, *zSig0, term2, 0, FLOAT_PI_HI, FLOAT_PI_LO, zSig1, zSig0, &term2);
}
*zSig1 = term2;
return q;
}
LOCALFUNC int reduce_trig_arg(int expDiff, int *zSign, Bit64u *aSig0, Bit64u *aSig1)
{
Bit64u term0, term1, q = 0;
if (expDiff < 0) {
shift128Right(*aSig0, 0, 1, aSig0, aSig1);
expDiff = 0;
}
if (expDiff > 0) {
q = argument_reduction_kernel(*aSig0, expDiff, aSig0, aSig1);
}
else {
if (FLOAT_PI_HI <= *aSig0) {
*aSig0 -= FLOAT_PI_HI;
q = 1;
}
}
shift128Right(FLOAT_PI_HI, FLOAT_PI_LO, 1, &term0, &term1);
if (! lt128(*aSig0, *aSig1, term0, term1))
{
int lt = lt128(term0, term1, *aSig0, *aSig1);
int eq = eq128(*aSig0, *aSig1, term0, term1);
if ((eq && (q & 1)) || lt) {
*zSign = !*zSign;
++q;
}
if (lt) sub128(FLOAT_PI_HI, FLOAT_PI_LO, *aSig0, *aSig1, aSig0, aSig1);
}
return (int)(q & 3);
}
#define SIN_ARR_SIZE 9
#define COS_ARR_SIZE 9
static float128 sin_arr[SIN_ARR_SIZE] =
{
PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /* 1 */
PACK_FLOAT_128(0xbffc555555555555, 0x5555555555555555), /* 3 */
PACK_FLOAT_128(0x3ff8111111111111, 0x1111111111111111), /* 5 */
PACK_FLOAT_128(0xbff2a01a01a01a01, 0xa01a01a01a01a01a), /* 7 */
PACK_FLOAT_128(0x3fec71de3a556c73, 0x38faac1c88e50017), /* 9 */
PACK_FLOAT_128(0xbfe5ae64567f544e, 0x38fe747e4b837dc7), /* 11 */
PACK_FLOAT_128(0x3fde6124613a86d0, 0x97ca38331d23af68), /* 13 */
PACK_FLOAT_128(0xbfd6ae7f3e733b81, 0xf11d8656b0ee8cb0), /* 15 */
PACK_FLOAT_128(0x3fce952c77030ad4, 0xa6b2605197771b00) /* 17 */
};
static float128 cos_arr[COS_ARR_SIZE] =
{
PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /* 0 */
PACK_FLOAT_128(0xbffe000000000000, 0x0000000000000000), /* 2 */
PACK_FLOAT_128(0x3ffa555555555555, 0x5555555555555555), /* 4 */
PACK_FLOAT_128(0xbff56c16c16c16c1, 0x6c16c16c16c16c17), /* 6 */
PACK_FLOAT_128(0x3fefa01a01a01a01, 0xa01a01a01a01a01a), /* 8 */
PACK_FLOAT_128(0xbfe927e4fb7789f5, 0xc72ef016d3ea6679), /* 10 */
PACK_FLOAT_128(0x3fe21eed8eff8d89, 0x7b544da987acfe85), /* 12 */
PACK_FLOAT_128(0xbfda93974a8c07c9, 0xd20badf145dfa3e5), /* 14 */
PACK_FLOAT_128(0x3fd2ae7f3e733b81, 0xf11d8656b0ee8cb0) /* 16 */
};
/* 0 <= x <= pi/4 */
LOCALINLINEFUNC float128 poly_sin(float128 x)
{
// 3 5 7 9 11 13 15
// x x x x x x x
// sin (x) ~ x - --- + --- - --- + --- - ---- + ---- - ---- =
// 3! 5! 7! 9! 11! 13! 15!
//
// 2 4 6 8 10 12 14
// x x x x x x x
// = x * [ 1 - --- + --- - --- + --- - ---- + ---- - ---- ] =
// 3! 5! 7! 9! 11! 13! 15!
//
// 3 3
// -- 4k -- 4k+2
// p(x) = > C * x > 0 q(x) = > C * x < 0
// -- 2k -- 2k+1
// k=0 k=0
//
// 2
// sin(x) ~ x * [ p(x) + x * q(x) ]
//
return OddPoly(x, sin_arr, SIN_ARR_SIZE);
}
/* 0 <= x <= pi/4 */
LOCALINLINEFUNC float128 poly_cos(float128 x)
{
// 2 4 6 8 10 12 14
// x x x x x x x
// cos (x) ~ 1 - --- + --- - --- + --- - ---- + ---- - ----
// 2! 4! 6! 8! 10! 12! 14!
//
// 3 3
// -- 4k -- 4k+2
// p(x) = > C * x > 0 q(x) = > C * x < 0
// -- 2k -- 2k+1
// k=0 k=0
//
// 2
// cos(x) ~ [ p(x) + x * q(x) ]
//
return EvenPoly(x, cos_arr, COS_ARR_SIZE);
}
LOCALINLINEPROC sincos_invalid(floatx80 *sin_a, floatx80 *cos_a, floatx80 a)
{
if (sin_a) *sin_a = a;
if (cos_a) *cos_a = a;
}
LOCALINLINEPROC sincos_tiny_argument(floatx80 *sin_a, floatx80 *cos_a, floatx80 a)
{
if (sin_a) *sin_a = a;
if (cos_a) *cos_a = floatx80_one;
}
LOCALFUNC floatx80 sincos_approximation(int neg, float128 r, Bit64u quotient)
{
floatx80 result;
if (quotient & 0x1) {
r = poly_cos(r);
neg = 0;
} else {
r = poly_sin(r);
}
result = float128_to_floatx80(r);
if (quotient & 0x2)
neg = ! neg;
if (neg)
floatx80_chs(&result);
return result;
}
// =================================================
// FSINCOS Compute sin(x) and cos(x)
// =================================================
//
// Uses the following identities:
// ----------------------------------------------------------
//
// sin(-x) = -sin(x)
// cos(-x) = cos(x)
//
// sin(x+y) = sin(x)*cos(y)+cos(x)*sin(y)
// cos(x+y) = sin(x)*sin(y)+cos(x)*cos(y)
//
// sin(x+ pi/2) = cos(x)
// sin(x+ pi) = -sin(x)
// sin(x+3pi/2) = -cos(x)
// sin(x+2pi) = sin(x)
//
LOCALFUNC int fsincos(floatx80 a, floatx80 *sin_a, floatx80 *cos_a)
{
float128 r;
Bit64u aSig0, aSig1 = 0;
Bit32s aExp, zExp, expDiff;
int aSign, zSign;
int q = 0;
#if 0
// handle unsupported extended double-precision floating encodings
if (floatx80_is_unsupported(a))
{
goto invalid;
}
#endif
aSig0 = extractFloatx80Frac(a);
aExp = extractFloatx80Exp(a);
aSign = extractFloatx80Sign(a);
/* invalid argument */
if (aExp == 0x7FFF) {
if ((Bit64u) (aSig0<<1)) {
sincos_invalid(sin_a, cos_a, propagateOneFloatx80NaN(&a));
return 0;
}
#if 0
invalid:
#endif
float_raise(float_flag_invalid);
sincos_invalid(sin_a, cos_a, floatx80_default_nan);
return 0;
}
if (aExp == 0) {
if (aSig0 == 0) {
sincos_tiny_argument(sin_a, cos_a, a);
return 0;
}
float_raise(float_flag_denormal);
/* handle pseudo denormals */
if (! (aSig0 & LIT64(0x8000000000000000)))
{
float_raise(float_flag_inexact);
if (sin_a)
float_raise(float_flag_underflow);
sincos_tiny_argument(sin_a, cos_a, a);
return 0;
}
normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
}
zSign = aSign;
zExp = EXP_BIAS;
expDiff = aExp - zExp;
/* argument is out-of-range */
if (expDiff >= 63)
return -1;
float_raise(float_flag_inexact);
if (expDiff < -1) { // doesn't require reduction
if (expDiff <= -68) {
a = packFloatx80(aSign, aExp, aSig0);
sincos_tiny_argument(sin_a, cos_a, a);
return 0;
}
zExp = aExp;
}
else {
q = reduce_trig_arg(expDiff, &zSign, &aSig0, &aSig1);
}
/* **************************** */
/* argument reduction completed */
/* **************************** */
/* using float128 for approximation */
r = normalizeRoundAndPackFloat128(0, zExp-0x10, aSig0, aSig1);
if (aSign) q = -q;
if (sin_a) *sin_a = sincos_approximation(zSign, r, q);
if (cos_a) *cos_a = sincos_approximation(zSign, r, q+1);
return 0;
}
// =================================================
// FPTAN Compute tan(x)
// =================================================
//
// Uses the following identities:
//
// 1. ----------------------------------------------------------
//
// sin(-x) = -sin(x)
// cos(-x) = cos(x)
//
// sin(x+y) = sin(x)*cos(y)+cos(x)*sin(y)
// cos(x+y) = sin(x)*sin(y)+cos(x)*cos(y)
//
// sin(x+ pi/2) = cos(x)
// sin(x+ pi) = -sin(x)
// sin(x+3pi/2) = -cos(x)
// sin(x+2pi) = sin(x)
//
// 2. ----------------------------------------------------------
//
// sin(x)
// tan(x) = ------
// cos(x)
//
LOCALFUNC int ftan(floatx80 *a)
{
float128 r;
float128 sin_r;
float128 cos_r;
Bit64u aSig0, aSig1 = 0;
Bit32s aExp, zExp, expDiff;
int aSign, zSign;
int q = 0;
#if 0
// handle unsupported extended double-precision floating encodings
if (floatx80_is_unsupported(*a))
{
goto invalid;
}
#endif
aSig0 = extractFloatx80Frac(*a);
aExp = extractFloatx80Exp(*a);
aSign = extractFloatx80Sign(*a);
/* invalid argument */
if (aExp == 0x7FFF) {
if ((Bit64u) (aSig0<<1))
{
*a = propagateOneFloatx80NaN(a);
return 0;
}
#if 0
invalid:
#endif
float_raise(float_flag_invalid);
*a = floatx80_default_nan;
return 0;
}
if (aExp == 0) {
if (aSig0 == 0) return 0;
float_raise(float_flag_denormal);
/* handle pseudo denormals */
if (! (aSig0 & LIT64(0x8000000000000000)))
{
float_raise(float_flag_inexact | float_flag_underflow);
return 0;
}
normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
}
zSign = aSign;
zExp = EXP_BIAS;
expDiff = aExp - zExp;
/* argument is out-of-range */
if (expDiff >= 63)
return -1;
float_raise(float_flag_inexact);
if (expDiff < -1) { // doesn't require reduction
if (expDiff <= -68) {
*a = packFloatx80(aSign, aExp, aSig0);
return 0;
}
zExp = aExp;
}
else {
q = reduce_trig_arg(expDiff, &zSign, &aSig0, &aSig1);
}
/* **************************** */
/* argument reduction completed */
/* **************************** */
/* using float128 for approximation */
r = normalizeRoundAndPackFloat128(0, zExp-0x10, aSig0, aSig1);
sin_r = poly_sin(r);
cos_r = poly_cos(r);
if (q & 0x1) {
r = float128_div(cos_r, sin_r);
zSign = ! zSign;
} else {
r = float128_div(sin_r, cos_r);
}
*a = float128_to_floatx80(r);
if (zSign)
floatx80_chs(a);
return 0;
}
/* ----- end from original file "fsincos.cc" ----- */
/* ----- from original file "fpatan.cc" ----- */
/*
["original Stanislav Shwartsman Copyright notice" went here, included near top of this file.]
*/
#define FPATAN_ARR_SIZE 11
#if 0
static const float128 float128_one =
packFloat2x128m(LIT64(0x3fff000000000000), LIT64(0x0000000000000000));
#endif
static const float128 float128_sqrt3 =
packFloat2x128m(LIT64(0x3fffbb67ae8584ca), LIT64(0xa73b25742d7078b8));
static const floatx80 floatx80_pi =
packFloatx80m(0, 0x4000, LIT64(0xc90fdaa22168c235));
static const float128 float128_pi2 =
packFloat2x128m(LIT64(0x3fff921fb54442d1), LIT64(0x8469898CC5170416));
static const float128 float128_pi4 =
packFloat2x128m(LIT64(0x3ffe921fb54442d1), LIT64(0x8469898CC5170416));
static const float128 float128_pi6 =
packFloat2x128m(LIT64(0x3ffe0c152382d736), LIT64(0x58465BB32E0F580F));
static float128 atan_arr[FPATAN_ARR_SIZE] =
{
PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /* 1 */
PACK_FLOAT_128(0xbffd555555555555, 0x5555555555555555), /* 3 */
PACK_FLOAT_128(0x3ffc999999999999, 0x999999999999999a), /* 5 */
PACK_FLOAT_128(0xbffc249249249249, 0x2492492492492492), /* 7 */
PACK_FLOAT_128(0x3ffbc71c71c71c71, 0xc71c71c71c71c71c), /* 9 */
PACK_FLOAT_128(0xbffb745d1745d174, 0x5d1745d1745d1746), /* 11 */
PACK_FLOAT_128(0x3ffb3b13b13b13b1, 0x3b13b13b13b13b14), /* 13 */
PACK_FLOAT_128(0xbffb111111111111, 0x1111111111111111), /* 15 */
PACK_FLOAT_128(0x3ffae1e1e1e1e1e1, 0xe1e1e1e1e1e1e1e2), /* 17 */
PACK_FLOAT_128(0xbffaaf286bca1af2, 0x86bca1af286bca1b), /* 19 */
PACK_FLOAT_128(0x3ffa861861861861, 0x8618618618618618) /* 21 */
};
/* |x| < 1/4 */
LOCALFUNC float128 poly_atan(float128 x1)
{
/*
// 3 5 7 9 11 13 15 17
// x x x x x x x x
// atan(x) ~ x - --- + --- - --- + --- - ---- + ---- - ---- + ----
// 3 5 7 9 11 13 15 17
//
// 2 4 6 8 10 12 14 16
// x x x x x x x x
// = x * [ 1 - --- + --- - --- + --- - ---- + ---- - ---- + ---- ]
// 3 5 7 9 11 13 15 17
//
// 5 5
// -- 4k -- 4k+2
// p(x) = > C * x q(x) = > C * x
// -- 2k -- 2k+1
// k=0 k=0
//
// 2
// atan(x) ~ x * [ p(x) + x * q(x) ]
//
*/
return OddPoly(x1, atan_arr, FPATAN_ARR_SIZE);
}
// =================================================
// FPATAN Compute y * log (x)
// 2
// =================================================
//
// Uses the following identities:
//
// 1. ----------------------------------------------------------
//
// atan(-x) = -atan(x)
//
// 2. ----------------------------------------------------------
//
// x + y
// atan(x) + atan(y) = atan -------, xy < 1
// 1-xy
//
// x + y
// atan(x) + atan(y) = atan ------- + PI, x > 0, xy > 1
// 1-xy
//
// x + y
// atan(x) + atan(y) = atan ------- - PI, x < 0, xy > 1
// 1-xy
//
// 3. ----------------------------------------------------------
//
// atan(x) = atan(INF) + atan(- 1/x)
//
// x-1
// atan(x) = PI/4 + atan( ----- )
// x+1
//
// x * sqrt(3) - 1
// atan(x) = PI/6 + atan( ----------------- )
// x + sqrt(3)
//
// 4. ----------------------------------------------------------
// 3 5 7 9 2n+1
// x x x x n x
// atan(x) = x - --- + --- - --- + --- - ... + (-1) ------ + ...
// 3 5 7 9 2n+1
//
LOCALFUNC floatx80 fpatan(floatx80 a, floatx80 b)
{
float128 a128;
float128 b128;
float128 x;
int swap, add_pi6, add_pi4;
Bit32s xExp;
floatx80 result;
int rSign;
// handle unsupported extended double-precision floating encodings
#if 0
if (floatx80_is_unsupported(a)) {
float_raise(float_flag_invalid);
return floatx80_default_nan;
}
#endif
Bit64u aSig = extractFloatx80Frac(a);
Bit32s aExp = extractFloatx80Exp(a);
int aSign = extractFloatx80Sign(a);
Bit64u bSig = extractFloatx80Frac(b);
Bit32s bExp = extractFloatx80Exp(b);
int bSign = extractFloatx80Sign(b);
int zSign = aSign ^ bSign;
if (bExp == 0x7FFF)
{
if ((Bit64u) (bSig<<1))
return propagateFloatx80NaN(a, b);
if (aExp == 0x7FFF) {
if ((Bit64u) (aSig<<1))
return propagateFloatx80NaN(a, b);
if (aSign) { /* return 3PI/4 */
return roundAndPackFloatx80(80, bSign,
FLOATX80_3PI4_EXP, FLOAT_3PI4_HI, FLOAT_3PI4_LO);
}
else { /* return PI/4 */
return roundAndPackFloatx80(80, bSign,
FLOATX80_PI4_EXP, FLOAT_PI_HI, FLOAT_PI_LO);
}
}
if (aSig && (aExp == 0))
float_raise(float_flag_denormal);
/* return PI/2 */
return roundAndPackFloatx80(80, bSign, FLOATX80_PI2_EXP, FLOAT_PI_HI, FLOAT_PI_LO);
}
if (aExp == 0x7FFF)
{
if ((Bit64u) (aSig<<1))
return propagateFloatx80NaN(a, b);
if (bSig && (bExp == 0))
float_raise(float_flag_denormal);
return_PI_or_ZERO:
if (aSign) { /* return PI */
return roundAndPackFloatx80(80, bSign, FLOATX80_PI_EXP, FLOAT_PI_HI, FLOAT_PI_LO);
} else { /* return 0 */
return packFloatx80(bSign, 0, 0);
}
}
if (bExp == 0)
{
if (bSig == 0) {
if (aSig && (aExp == 0)) float_raise(float_flag_denormal);
goto return_PI_or_ZERO;
}
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
}
if (aExp == 0)
{
if (aSig == 0) /* return PI/2 */
return roundAndPackFloatx80(80, bSign, FLOATX80_PI2_EXP, FLOAT_PI_HI, FLOAT_PI_LO);
float_raise(float_flag_denormal);
normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
}
float_raise(float_flag_inexact);
/* |a| = |b| ==> return PI/4 */
if (aSig == bSig && aExp == bExp)
return roundAndPackFloatx80(80, bSign, FLOATX80_PI4_EXP, FLOAT_PI_HI, FLOAT_PI_LO);
/* ******************************** */
/* using float128 for approximation */
/* ******************************** */
a128 = normalizeRoundAndPackFloat128(0, aExp-0x10, aSig, 0);
b128 = normalizeRoundAndPackFloat128(0, bExp-0x10, bSig, 0);
swap = 0;
add_pi6 = 0;
add_pi4 = 0;
if (aExp > bExp || (aExp == bExp && aSig > bSig))
{
x = float128_div(b128, a128);
}
else {
x = float128_div(a128, b128);
swap = 1;
}
xExp = extractFloat128Exp(x);
if (xExp <= EXP_BIAS-40)
goto approximation_completed;
if (x.high >= LIT64(0x3ffe800000000000)) // 3/4 < x < 1
{
/*
arctan(x) = arctan((x-1)/(x+1)) + pi/4
*/
float128 t1 = float128_sub(x, float128_one);
float128 t2 = float128_add(x, float128_one);
x = float128_div(t1, t2);
add_pi4 = 1;
}
else
{
/* argument correction */
if (xExp >= 0x3FFD) // 1/4 < x < 3/4
{
/*
arctan(x) = arctan((x*sqrt(3)-1)/(x+sqrt(3))) + pi/6
*/
float128 t1 = float128_mul(x, float128_sqrt3);
float128 t2 = float128_add(x, float128_sqrt3);
x = float128_sub(t1, float128_one);
x = float128_div(x, t2);
add_pi6 = 1;
}
}
x = poly_atan(x);
if (add_pi6) x = float128_add(x, float128_pi6);
if (add_pi4) x = float128_add(x, float128_pi4);
approximation_completed:
if (swap) x = float128_sub(float128_pi2, x);
result = float128_to_floatx80(x);
if (zSign) floatx80_chs(&result);
rSign = extractFloatx80Sign(result);
if (!bSign && rSign)
return floatx80_add(result, floatx80_pi);
if (bSign && !rSign)
return floatx80_sub(result, floatx80_pi);
return result;
}
/* ----- end from original file "fpatan.cc" ----- */
/* end soft float stuff */
typedef floatx80 myfpr;
LOCALPROC myfp_FromExtendedFormat(myfpr *r, uint16_t v2, uint32_t v1, uint32_t v0)
{
r->high = v2;
r->low = (((uint64_t)v1) << 32) | (v0 & 0xFFFFFFFF);
}
LOCALPROC myfp_ToExtendedFormat(myfpr *dd, uint16_t *v2, uint32_t *v1, uint32_t *v0)
{
*v0 = ((uint64_t) dd->low) & 0xFFFFFFFF;
*v1 = (((uint64_t) dd->low) >> 32) & 0xFFFFFFFF;
*v2 = dd->high;
}
LOCALPROC myfp_FromDoubleFormat(myfpr *r, uint32_t v1, uint32_t v0)
{
float64 t = (float64)((((uint64_t)v1) << 32) | (v0 & 0xFFFFFFFF));
*r = float64_to_floatx80(t);
}
LOCALPROC myfp_ToDoubleFormat(myfpr *dd, uint32_t *v1, uint32_t *v0)
{
float64 t = floatx80_to_float64(*dd);
*v0 = ((uint64_t) t) & 0xFFFFFFFF;
*v1 = (((uint64_t) t) >> 32) & 0xFFFFFFFF;
}
LOCALPROC myfp_FromSingleFormat(myfpr *r, uint32_t x)
{
*r = float32_to_floatx80(x);
}
LOCALFUNC uint32_t myfp_ToSingleFormat(myfpr *ff)
{
return floatx80_to_float32(*ff);
}
LOCALPROC myfp_FromLong(myfpr *r, uint32_t x)
{
*r = int32_to_floatx80( x );
}
LOCALFUNC uint32_t myfp_ToLong(myfpr *x)
{
return floatx80_to_int32( *x );
}
LOCALFUNC bool myfp_IsNan(myfpr *x)
{
return floatx80_is_nan(*x);
}
LOCALFUNC bool myfp_IsInf(myfpr *x)
{
return ( ( x->high & 0x7FFF ) == 0x7FFF ) && (0 == ((uint64_t) ( x->low<<1 )));
}
LOCALFUNC bool myfp_IsZero(myfpr *x)
{
return ( ( x->high & 0x7FFF ) == 0x0000 ) && (0 == ((uint64_t) ( x->low<<1 )));
}
LOCALFUNC bool myfp_IsNeg(myfpr *x)
{
return ( ( x->high & 0x8000 ) != 0x0000 );
}
LOCALPROC myfp_Add(myfpr *r, const myfpr *a, const myfpr *b)
{
*r = floatx80_add(*a, *b);
}
LOCALPROC myfp_Sub(myfpr *r, const myfpr *a, const myfpr *b)
{
*r = floatx80_sub(*a, *b);
}
LOCALPROC myfp_Mul(myfpr *r, const myfpr *a, const myfpr *b)
{
*r = floatx80_mul(*a, *b);
}
LOCALPROC myfp_Div(myfpr *r, const myfpr *a, const myfpr *b)
{
*r = floatx80_div(*a, *b);
}
LOCALPROC myfp_Rem(myfpr *r, const myfpr *a, const myfpr *b)
{
*r = floatx80_rem(*a, *b);
}
LOCALPROC myfp_Sqrt(myfpr *r, myfpr *x)
{
*r = floatx80_sqrt(*x);
}
LOCALPROC myfp_Mod(myfpr *r, myfpr *a, myfpr *b)
{
Bit64u q;
*r = floatx80_remainder(*a, *b, &q);
/* should save low byte of q */
}
LOCALPROC myfp_Scale(myfpr *r, myfpr *a, myfpr *b)
{
*r = floatx80_scale(*a, *b);
}
LOCALPROC myfp_GetMan(myfpr *r, myfpr *x)
{
*r = *x;
(void) floatx80_extract(r);
}
LOCALPROC myfp_GetExp(myfpr *r, myfpr *x)
{
floatx80 t0 = *x;
*r = floatx80_extract(&t0);
}
LOCALPROC myfp_floor(myfpr *r, myfpr *x)
{
int8_t SaveRoundingMode = float_rounding_mode;
float_rounding_mode = float_round_down;
*r = floatx80_round_to_int(*x);
float_rounding_mode = SaveRoundingMode;
}
LOCALPROC myfp_IntRZ(myfpr *r, myfpr *x)
{
int8_t SaveRoundingMode = float_rounding_mode;
float_rounding_mode = float_round_to_zero;
*r = floatx80_round_to_int(*x);
float_rounding_mode = SaveRoundingMode;
}
LOCALPROC myfp_Int(myfpr *r, myfpr *x)
{
*r = floatx80_round_to_int(*x);
}
LOCALPROC myfp_RoundToSingle(myfpr *r, myfpr *x)
{
float32 t0 = floatx80_to_float32(*x);
*r = float32_to_floatx80(t0);
}
LOCALPROC myfp_RoundToDouble(myfpr *r, myfpr *x)
{
float64 t0 = floatx80_to_float64(*x);
*r = float64_to_floatx80(t0);
}
LOCALPROC myfp_Abs(myfpr *r, myfpr *x)
{
*r = *x;
r->high &= 0x7FFF;
}
LOCALPROC myfp_Neg(myfpr *r, myfpr *x)
{
*r = *x;
r->high ^= 0x8000;
}
LOCALPROC myfp_TwoToX(myfpr *r, myfpr *x)
{
floatx80 t2;
floatx80 t3;
floatx80 t4;
floatx80 t5;
myfp_floor(&t2, x);
t3 = floatx80_sub(*x, t2);
t4 = f2xm1(t3);
t5 = floatx80_add(t4, floatx80_one);
*r = floatx80_scale(t5, t2);
}
LOCALPROC myfp_TenToX(myfpr *r, myfpr *x)
{
floatx80 t1;
const floatx80 t = /* 1.0 / log(2.0) */
packFloatx80m(0, 0x3fff, LIT64(0xb8aa3b295c17f0bc));
const floatx80 t2 = /* log(10.0) */
packFloatx80m(0, 0x4000, LIT64(0x935d8dddaaa8ac17));
t1 = floatx80_mul(floatx80_mul(*x, t), t2);
myfp_TwoToX(r, &t1);
}
LOCALPROC myfp_EToX(myfpr *r, myfpr *x)
{
floatx80 t1;
const floatx80 t = /* 1.0 / log(2.0) */
packFloatx80m(0, 0x3fff, LIT64(0xb8aa3b295c17f0bc));
t1 = floatx80_mul(*x, t);
myfp_TwoToX(r, &t1);
}
LOCALPROC myfp_EToXM1(myfpr *r, myfpr *x)
{
floatx80 t1;
floatx80 t2;
floatx80 t3;
floatx80 t4;
floatx80 t5;
floatx80 t6;
const floatx80 t = /* 1.0 / log(2.0) */
packFloatx80m(0, 0x3fff, LIT64(0xb8aa3b295c17f0bc));
t1 = floatx80_mul(*x, t);
myfp_floor(&t2, &t1);
t3 = floatx80_sub(t1, t2);
t4 = f2xm1(t3);
if (myfp_IsZero(&t2)) {
*r = t4;
} else {
t5 = floatx80_add(t4, floatx80_one);
t6 = floatx80_scale(t5, t2);
*r = floatx80_sub(t6, floatx80_one);
}
}
LOCALPROC myfp_Log2(myfpr *r, myfpr *x)
{
*r = fyl2x(*x, floatx80_one);
}
LOCALPROC myfp_LogN(myfpr *r, myfpr *x)
{
const floatx80 t = /* log(2.0) */
packFloatx80m(0, 0x3ffe, LIT64(0xb17217f7d1cf79ac));
*r = fyl2x(*x, t);
}
LOCALPROC myfp_Log10(myfpr *r, myfpr *x)
{
const floatx80 t = /* log10(2.0) = ln(2) / ln(10), unknown accuracy */
packFloatx80m(0, 0x3ffd, LIT64(0x9a209a84fbcff798));
*r = fyl2x(*x, t);
}
LOCALPROC myfp_LogNP1(myfpr *r, myfpr *x)
{
const floatx80 t = /* log(2.0) */
packFloatx80m(0, 0x3ffe, LIT64(0xb17217f7d1cf79ac));
*r = fyl2xp1(*x, t);
}
LOCALPROC myfp_Sin(myfpr *r, myfpr *x)
{
(void) fsincos(*x, r, 0);
}
LOCALPROC myfp_Cos(myfpr *r, myfpr *x)
{
(void) fsincos(*x, 0, r);
}
LOCALPROC myfp_Tan(myfpr *r, myfpr *x)
{
*r = *x;
(void) ftan(r);
}
LOCALPROC myfp_ATan(myfpr *r, myfpr *x)
{
*r = fpatan(floatx80_one, *x);
}
LOCALPROC myfp_ASin(myfpr *r, myfpr *x)
{
floatx80 x2 = floatx80_mul(*x, *x);
floatx80 mx2 = floatx80_sub(floatx80_one, x2);
floatx80 cx = floatx80_sqrt(mx2);
*r = fpatan(cx, *x);
}
LOCALPROC myfp_ACos(myfpr *r, myfpr *x)
{
floatx80 x2 = floatx80_mul(*x, *x);
floatx80 mx2 = floatx80_sub(floatx80_one, x2);
floatx80 cx = floatx80_sqrt(mx2);
*r = fpatan(*x, cx);
}
static const floatx80 floatx80_zero =
packFloatx80m(0, 0x0000, LIT64(0x0000000000000000));
static const floatx80 floatx80_Two =
packFloatx80m(0, 0x4000, LIT64(0x8000000000000000));
static const floatx80 floatx80_Ten =
packFloatx80m(0, 0x4002, LIT64(0xa000000000000000));
LOCALPROC myfp_Sinh(myfpr *r, myfpr *x)
{
myfpr ex;
myfpr nx;
myfpr enx;
myfpr t1;
myfp_EToX(&ex, x);
myfp_Neg(&nx, x);
myfp_EToX(&enx, &nx);
myfp_Sub(&t1, &ex, &enx);
myfp_Div(r, &t1, &floatx80_Two);
}
LOCALPROC myfp_Cosh(myfpr *r, myfpr *x)
{
myfpr ex;
myfpr nx;
myfpr enx;
myfpr t1;
myfp_EToX(&ex, x);
myfp_Neg(&nx, x);
myfp_EToX(&enx, &nx);
myfp_Add(&t1, &ex, &enx);
myfp_Div(r, &t1, &floatx80_Two);
}
LOCALPROC myfp_Tanh(myfpr *r, myfpr *x)
{
myfpr x2;
myfpr ex2;
myfpr ex2m1;
myfpr ex2p1;
myfp_Mul(&x2, x, &floatx80_Two);
myfp_EToX(&ex2, &x2);
myfp_Sub(&ex2m1, &ex2, &floatx80_one);
myfp_Add(&ex2p1, &ex2, &floatx80_one);
myfp_Div(r, &ex2m1, &ex2p1);
}
LOCALPROC myfp_ATanh(myfpr *r, myfpr *x)
{
myfpr onepx;
myfpr onemx;
myfpr dv;
myfpr ldv;
myfp_Add(&onepx, x, &floatx80_one);
myfp_Sub(&onemx, x, &floatx80_one);
myfp_Div(&dv, &onepx, &onemx);
myfp_LogN(&ldv, &dv);
myfp_Div(r, &ldv, &floatx80_Two);
}
LOCALPROC myfp_SinCos(myfpr *r_sin, myfpr *r_cos, myfpr *source)
{
(void) fsincos(*source, r_sin, r_cos);
}
LOCALFUNC bool myfp_getCR(myfpr *r, uint16_t opmode)
{
switch (opmode) {
case 0x00:
*r = floatx80_pi; /* M_PI */
break;
case 0x0B:
{
const floatx80 t =
packFloatx80m(0, 0x3ffd, LIT64(0x9a209a84fbcff798));
*r = t; /* log10(2.0) = ln(2) / ln(10), unknown accuracy */
}
break;
case 0x0C:
{
const floatx80 t =
packFloatx80m(0, 0x4000, LIT64(0xadf85458a2bb4a9b));
*r = t; /* exp(1.0), unknown accuracy */
}
break;
case 0x0D:
{
const floatx80 t =
packFloatx80m(0, 0x3fff, LIT64(0xb8aa3b295c17f0bc));
*r = t; /* 1.0 / log(2.0) */
}
break;
case 0x0E:
{
const floatx80 t =
packFloatx80m(0, 0x3ffd, LIT64(0xde5bd8a937287195));
*r = t; /* 1.0 / log(10.0), unknown accuracy */
}
break;
case 0x0F:
*r = floatx80_zero; /* 0.0 */
break;
case 0x30:
{
const floatx80 t =
packFloatx80m(0, 0x3ffe, LIT64(0xb17217f7d1cf79ac));
*r = t; /* log(2.0) */
}
break;
case 0x31:
{
const floatx80 t =
packFloatx80m(0, 0x4000, LIT64(0x935d8dddaaa8ac17));
*r = t; /* log(10.0) */
}
break;
case 0x32:
*r = floatx80_one; /* 1.0 */
break;
case 0x33:
*r = floatx80_Ten; /* 10.0 */
break;
case 0x34:
{
const floatx80 t =
packFloatx80m(0, 0x4005, LIT64(0xc800000000000000));
*r = t; /* 100.0 */
}
break;
case 0x35:
{
const floatx80 t =
packFloatx80m(0, 0x400c, LIT64(0x9c40000000000000));
*r = t; /* 10000.0 */
}
break;
case 0x36:
{
const floatx80 t =
packFloatx80m(0, 0x4019, LIT64(0xbebc200000000000));
*r = t; /* 1.0e8 */
}
break;
case 0x37:
{
const floatx80 t =
packFloatx80m(0, 0x4034, LIT64(0x8e1bc9bf04000000));
*r = t; /* 1.0e16 */
}
break;
case 0x38:
{
const floatx80 t =
packFloatx80m(0, 0x4069, LIT64(0x9dc5ada82b70b59e));
*r = t; /* 1.0e32 */
}
break;
case 0x39:
{
const floatx80 t =
packFloatx80m(0, 0x40d3, LIT64(0xc2781f49ffcfa6d5));
*r = t; /* 1.0e64 */
}
break;
case 0x3A:
{
const floatx80 t =
packFloatx80m(0, 0x41a8, LIT64(0x93ba47c980e98ce0));
*r = t; /* 1.0e128 */
}
break;
case 0x3B:
{
const floatx80 t =
packFloatx80m(0, 0x4351, LIT64(0xaa7eebfb9df9de8e));
*r = t; /* 1.0e256 */
}
break;
case 0x3C:
{
const floatx80 t =
packFloatx80m(0, 0x46a3, LIT64(0xe319a0aea60e91c7));
*r = t; /* 1.0e512 */
}
break;
case 0x3D:
{
const floatx80 t =
packFloatx80m(0, 0x4d48, LIT64(0xc976758681750c17));
*r = t; /* 1.0e1024 */
}
break;
case 0x3E:
{
const floatx80 t =
packFloatx80m(0, 0x5a92, LIT64(0x9e8b3b5dc53d5de5));
*r = t; /* 1.0e2048 */
}
break;
case 0x3F:
{
const floatx80 t =
packFloatx80m(0, 0x7525, LIT64(0xc46052028a20979b));
*r = t; /* 1.0e4096 */
}
break;
default:
return false;
}
return true;
}
/* Floating point control register */
LOCALPROC myfp_SetFPCR(uint32_t v)
{
switch ((v >> 4) & 0x03) {
case 0:
float_rounding_mode = float_round_nearest_even;
break;
case 1:
float_rounding_mode = float_round_to_zero;
break;
case 2:
float_rounding_mode = float_round_down;
break;
case 3:
float_rounding_mode = float_round_up;
break;
}
switch ((v >> 6) & 0x03) {
case 0:
floatx80_rounding_precision = 80;
break;
case 1:
floatx80_rounding_precision = 32;
break;
case 2:
floatx80_rounding_precision = 64;
break;
case 3:
ReportAbnormalID(0x0201,
"Bad rounding precision in myfp_SetFPCR");
floatx80_rounding_precision = 80;
break;
}
if (0 != (v & 0xF)) {
ReportAbnormalID(0x0202,
"Reserved bits not zero in myfp_SetFPCR");
}
}
LOCALFUNC uint32_t myfp_GetFPCR(void)
{
uint32_t v = 0;
switch (float_rounding_mode) {
case float_round_nearest_even:
/* v |= (0 << 4); */
break;
case float_round_to_zero:
v |= (1 << 4);
break;
case float_round_down:
v |= (2 << 4);
break;
case float_round_up:
v |= (3 << 4);
break;
}
if (80 == floatx80_rounding_precision) {
/* v |= (0 << 6); */
} else if (32 == floatx80_rounding_precision) {
v |= (1 << 6);
} else if (64 == floatx80_rounding_precision) {
v |= (2 << 6);
} else {
ReportAbnormalID(0x0203,
"Bad rounding precision in myfp_GetFPCR");
}
return v;
}
LOCALVAR struct myfp_envStruct
{
uint32_t FPSR; /* Floating point status register */
} myfp_env;
LOCALPROC myfp_SetFPSR(uint32_t v)
{
myfp_env.FPSR = v;
}
LOCALFUNC uint32_t myfp_GetFPSR(void)
{
return myfp_env.FPSR;
}
LOCALFUNC uint8_t myfp_GetConditionCodeByte(void)
{
return (myfp_env.FPSR >> 24) & 0x0F;
}
LOCALPROC myfp_SetConditionCodeByte(uint8_t v)
{
myfp_env.FPSR = ((myfp_env.FPSR & 0x00FFFFFF)
| (v << 24));
}
LOCALPROC myfp_SetConditionCodeByteFromResult(myfpr *result)
{
/* Set condition codes here based on result */
int c_nan = myfp_IsNan(result) ? 1 : 0;
int c_inf = myfp_IsInf(result) ? 1 : 0;
int c_zero = myfp_IsZero(result) ? 1 : 0;
int c_neg = myfp_IsNeg(result) ? 1 : 0;
myfp_SetConditionCodeByte(c_nan
| (c_inf << 1)
| (c_zero << 2)
| (c_neg << 3));
}