Fixed a million SoftFloat bugs, and an edge case in fscale

SoftFloat's support for subnormal numbers is completely broken on floatx80. Particularly when the result of an op on a subnormal number is normal, and vice versa. I'm not completely sure that my fixes are correct, or that they didn't break anything. I need to do more testing. Also fixed an edge case in fscale that gets weird when the input goes from normal to subnormal... oh, and I still need to fix the opposite case.
2014-11-23 19:41:06 -05:00 · 2014-11-23 19:41:06 -05:00 · 9e91b8067a
parent 8142098fa1
commit 9e91b8067a
3 changed files with 164 additions and 26 deletions
--- a/core/SoftFloat/softfloat-macros.h
+++ b/core/SoftFloat/softfloat-macros.h
@ -1,3 +1,9 @@
+/*
+ * SoftFloat with lots of fixes and modified for use by Shoebill.
+ *
+ * Based on SoftFloat 2b by John R. Hauser.
+ * Modifications by Peter Rutenbar. (pruten@gmail.com)
+ */

 /*============================================================================

@ -152,7 +158,9 @@ INLINE void
        z0 = a0>>count;
    }
    else {
-        z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0;
+        // [shoebill] This is a bug, right? ( count < 64 ) can never be true
+        // z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0;
+        z1 = ( count < 128 ) ? ( a0>>( count & 63 ) ) : 0;
        z0 = 0;
    }
    *z1Ptr = z1;
--- a/core/SoftFloat/softfloat.c
+++ b/core/SoftFloat/softfloat.c
@ -1,3 +1,10 @@
+/*
+ * SoftFloat with lots of fixes and modified for use by Shoebill.
+ *
+ * Based on SoftFloat 2b by John R. Hauser.
+ * Modifications by Peter Rutenbar. (pruten@gmail.com)
+ */
+

 /*============================================================================

@ -30,6 +37,9 @@ these four paragraphs for those parts of this code that are retained.

 =============================================================================*/

+// [shoebill]
+// Inlining milieu.h
+
 /*----------------------------------------------------------------------------
 | Include common integer types and flags.
 *----------------------------------------------------------------------------*/
@ -307,6 +317,10 @@ static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
+            // [shoebill]
+            // I think there's a bug here too: if zSig==0x80000080 and zExp==-2,
+            // then that low bit will get lost. And since roundBits is 0,
+            // we won't raise underflow or inexact.
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
        }
    }
@ -376,7 +390,13 @@ INLINE flag extractFloat64Sign( float64 a )
 | significand are stored at the locations pointed to by `zExpPtr' and
 | `zSigPtr', respectively.
 *----------------------------------------------------------------------------*/
-
+// [shoebill]
+// This function left shifts the mantissa until it has an implicit 1 bit
+// just to the left of the mantissa (actually, it produces a value with an explicit bit
+// in that position, but the callers are aware of that.)
+// The exponent it returns is the "true" exponent - as if 0x0000 weren't a special case.
+// It behaves as though exp=0 was an order of magnitude below exp=1, (which is isn't for implicit bit formats),
+// and it returns negative numbers if it needs an even lower magnitude than that.
 static void
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 {
@ -388,6 +408,7 @@ static void

 }

+
 /*----------------------------------------------------------------------------
 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 | double-precision floating-point value, returning the result.  After being
@ -469,6 +490,10 @@ static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
+            // [shoebill]
+            // I think there's a bug here: if zSig==0x8000000000000400 and zExp==-2,
+            // then that low bit will get lost. And since roundBits is 0,
+            // we won't raise underflow or inexact.
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
        }
    }
@ -551,8 +576,12 @@ static void

    shiftCount = countLeadingZeros64( aSig );
    *zSigPtr = aSig<<shiftCount;
-    *zExpPtr = 1 - shiftCount;
-
+    // [shoebill]
+    // normalizeFloat*Subnormal returns the true normalized exponent.
+    // but for floatx80, with no implicit bit to account for,
+    // shiftCount *is* the true exponent. No need to add one
+    // *zExpPtr = 1 - shiftCount;
+    *zExpPtr = 0 - shiftCount;
 }

 /*----------------------------------------------------------------------------
@ -603,6 +632,10 @@ static floatx80
    flag roundNearestEven, increment, isTiny;
    int64 roundIncrement, roundMask, roundBits;

+    // [shoebill]
+    // printf("roundAndPackFloatx80: prec=%d sign=%d exp=%u sig0=0x%016llx sig1=0x%016llx\n",
+           // roundingPrecision, zSign, zExp, zSig0, zSig1);
+    
    roundingMode = float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
@ -644,7 +677,13 @@ static floatx80
                   ( float_detect_tininess == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
-            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
+            // [shoebill]
+            // Why are we jamming right by one extra bit? If zExp==0, then that's
+            // the "true" exponent
+            //
+            // shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
+            shift64RightJamming( zSig0, -zExp, &zSig0 );
+            
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
@ -711,7 +750,13 @@ static floatx80
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
-            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
+            // [shoebill]
+            // Why are we jamming right by one extra bit? If zExp==0, then that's
+            // the "true" exponent
+            //
+            // shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
+            shift64ExtraRightJamming( zSig0, zSig1, -zExp, &zSig0, &zSig1 );
+            
            zExp = 0;
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
@ -874,6 +919,8 @@ static void

 }

+
+
 /*----------------------------------------------------------------------------
 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
@ -983,6 +1030,10 @@ static float128
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
+            // [shoebill]
+            // I think there's a bug here too, like in roundAndPackFloat32/64,
+            // (but it's more complicated since zSig2 has bits jammed into it,
+            //  whereas the other functions use the constant roundBits value.)
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
            if ( roundNearestEven ) {
                increment = ( (sbits64) zSig2 < 0 );
@ -2546,6 +2597,9 @@ static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
            return a;
        }
+        // [shoebill]
+        // This works only by accident. if a and b were subnormal, but (a+b) is normal,
+        // (aSig+bSig) will leak a bit into the exponent, which will correctly set it to 1
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
        zExp = aExp;
@ -3321,7 +3375,24 @@ float128 floatx80_to_float128( floatx80 a )
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
        return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
    }
+    
+    // [shoebill]
+    // If the x80 has a non-zero exponent and its explicit bit is 0,
+    // then Motorola's 68881 docs call it "unnormal". packFloat128 can't
+    // handle that, so we need to de-unnormalize it to get a regular
+    // normal or subnormal number.
+    while ((aExp > 0) && ((aSig >> 63)==0)) {
+        aExp--;
+        aSig <<= 1;
+    }

+    // [shoebill]
+    // Don't chew off the x80's explicit bit without checking
+    // the exponent. If it's 0, then the explicit bit may be
+    // either 0 or 1.
+    //
+    // shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
+    
    if (aExp == 0)
        shift128Right( aSig, 0, 16, &zSig0, &zSig1 );
    else
@ -3424,21 +3495,27 @@ static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
-    if ( 0 < expDiff ) {
+    if ( 0 < expDiff ) { // [shoebill] aExp is bigger than bExp
        if ( aExp == 0x7FFF ) {
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
            return a;
        }
-        if ( bExp == 0 ) --expDiff;
+        // [shoebill]
+        // vestigial implicit-bit stuff
+        // if ( bExp == 0 ) --expDiff;
+        
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
-    else if ( expDiff < 0 ) {
+    else if ( expDiff < 0 ) { // [shoebill] aExp is smaller than bExp
        if ( bExp == 0x7FFF ) {
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
-        if ( aExp == 0 ) ++expDiff;
+        // [shoebill]
+        // vestigial implicit-bit stuff
+        // if ( aExp == 0 ) ++expDiff;
+        
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
@ -3452,7 +3529,21 @@ static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
-            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
+            // [shoebill]
+            // Adding subnormal numbers can possibly overflow, so we need to
+            // detect and handle that.
+            if ((( aSig >> 63 ) &&  ( bSig >> 63 )) ||
+                (( aSig >> 63 ) && !( zSig0 >> 63 )) ||
+                (( bSig >> 63 ) && !( zSig0 >> 63 ))) {
+                zSig1 = zSig0 << 63;
+                zSig0 >>= 1;
+                zSig0 |= LIT64( 0x8000000000000000 );
+                zExp = 1;
+            }
+            else {
+                normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
+            }
+            
            goto roundAndPack;
        }
        zExp = aExp;
@ -3502,10 +3593,15 @@ static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
        z.high = floatx80_default_nan_high;
        return z;
    }
-    if ( aExp == 0 ) {
-        aExp = 1;
-        bExp = 1;
-    }
+    // [shoebill]
+    // All these places where we check if exp==0 and then add one to it
+    // are vestigial implicit-bit stuff
+    //
+    //if ( aExp == 0 ) {
+    //    aExp = 1;
+    //    bExp = 1;
+    //}
+    
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
@ -3515,7 +3611,10 @@ static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
-    if ( aExp == 0 ) ++expDiff;
+    // [shoebill]
+    // vestigial implicit-bit stuff
+    //if ( aExp == 0 ) ++expDiff;
+    
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
@ -3527,7 +3626,9 @@ static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
        return a;
    }
-    if ( bExp == 0 ) --expDiff;
+    // [shoebill]
+    // vestigial implicit-bit stuff
+    //if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
@ -3628,6 +3729,7 @@ floatx80 floatx80_mul( floatx80 a, floatx80 b )
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
+    
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    if ( 0 < (sbits64) zSig0 ) {
@ -4350,10 +4452,10 @@ floatx80 float128_to_floatx80( float128 a )
    int32 aExp;
    bits64 aSig0, aSig1;

-    aSig1 = extractFloat128Frac1( a );
-    aSig0 = extractFloat128Frac0( a );
-    aExp = extractFloat128Exp( a );
-    aSign = extractFloat128Sign( a );
+    aSig1 = extractFloat128Frac1( a ); // [shoebill] low
+    aSig0 = extractFloat128Frac0( a ); // [shoebill] high & 0x0000ffffff...
+    aExp = extractFloat128Exp( a ); // [shoebill] (high >> 48) & 0x7fff
+    aSign = extractFloat128Sign( a ); // [shoebill] 1 or 0
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloatx80( float128ToCommonNaN( a ) );
@ -4363,13 +4465,23 @@ floatx80 float128_to_floatx80( float128 a )
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
+        // [shoebill]
+        // normalizeFloat128Subnormal (exp=0x0000, mant=0x0000 c000...) -> (aExp=0, mant=0x0001 8000...)
+        // normalizeFloat128Subnormal (exp=0x0000, mant=0x0000 4000...) -> (aExp=-1, mant=0x0001 0000...)
+        // if the original exponent was 0x0000, then the implicit bit is 0.
+        // So when making a x80-friendly mantissa, make sure the explicit bit is 0
+        shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    }
    else {
        aSig0 |= LIT64( 0x0001000000000000 );
+        // [shoebill]
+        shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    }
-    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
-    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );

+    // [shoebill]
+    // shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
+    
+    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
 }

 #endif
--- a/core/fpu.c
+++ b/core/fpu.c
@ -1876,7 +1876,7 @@ static void inst_fmath_fscale ()
    const _Bool dest_zero = _float128_is_zero(fpu->dest);
    const _Bool dest_inf = _float128_is_infinity(fpu->dest);
    
-    int32_t factor, exponent;
+    int32_t factor, orig_exponent, exponent;
    
    /* If the source is inf, the result is nan and set operr */
    if (source_inf) {
@ -1904,12 +1904,30 @@ static void inst_fmath_fscale ()
    
    assert(float_exception_flags == 0);
    
-    exponent = factor + (int32_t)((fpu->dest.high >> 48) & 0x7fff);
+    orig_exponent = (int32_t)((fpu->dest.high >> 48) & 0x7fff);
+    exponent = factor + orig_exponent;
    
-    if (exponent <= 0) /* not precisely correct, I think - should be '< 0' */
+    if (exponent < 0)
        goto underflow;
    else if (exponent >= 0x7fff)
        goto overflow;
+    else if ((exponent == 0) && (orig_exponent > 0)) {
+        uint64_t m_high;
+        /* 
+         * Edge case: if the 80-bit input was {exp=1, mantissa=1},
+         * the 128-bit version will be {exp=1, mantissa=0}.
+         * If we're subtracting 1, the result will be {exp=0, mantissa=0}
+         * which is not correct. We need to account for the implicit bit.
+         */
+        fpu->result = fpu->dest;
+        fpu->result.low >>= 1;
+        fpu->result.low |= (fpu->result.high << 63);
+        m_high = (fpu->result.high & 0x0000ffffffffffffULL) >> 1;
+        fpu->result.high >>= 63;
+        fpu->result.high <<= 63;
+        fpu->result.high |= m_high;
+        return;
+    }
    
    fpu->result = fpu->dest;
    fpu->result.high &= 0x8000ffffffffffffULL;