Implement fma().

This tries to carefully follow the C and IEEE standards regarding rounding, exceptions, etc. Like the other ORCA/C <math.h> functions, there is really just one version that has extended precision, so double rounding is still possible if the result gets assigned to a float or double variable. In addition to the tests I added to the ORCA/C test suite, I have also tested this against (somewhat modified versions of) the following: *FreeBSD fma tests by David Schultz: https://github.com/freebsd/freebsd-src/blob/release/9.3.0/tools/regression/lib/msun/test-fma.c *Tests by Bruno Haible, in the Gnulib test suite and attached to this bug report: https://sourceware.org/bugzilla/show_bug.cgi?id=13304
2025-08-05 15:24:57 +00:00 · 2023-04-02 16:30:29 -05:00
parent 3c1f357b0c
commit 68fc475721
2 changed files with 469 additions and 0 deletions
--- a/math2.asm
+++ b/math2.asm
@@ -1147,6 +1147,430 @@ ret      plx                            clean up stack
         rtl
         end

+****************************************************************
+*
+*  double fma(double x, double y, double z);
+*
+*  Compute (x * y) + z, rounded only once at the end.
+*
+****************************************************************
+*
+fma      start
+fmaf     entry
+fmal     entry
+         using MathCommon2
+mant1    equ    1                       mantissa of value 1
+exp1     equ    mant1+16                exponent of value 1
+sign1    equ    exp1+4                  sign of value 1 (high bit)
+mant2    equ    sign1+2                 mantissa of value 2
+exp2     equ    mant2+16                exponent of value 2
+sign2    equ    exp2+4                  sign of value 2 (low bit)
+expdiff  equ    sign2+2                 difference between exponents
+extra    equ    expdiff+4               extra bits (guard, round, sticky)
+xcps     equ    extra+2                 floating-point exceptions
+
+         csubroutine (10:x,10:y,10:z),54
+
+         stz   extra
+         lda   x                        if x or y is NAN, INF, or 0 then
+         ora   x+2
+         ora   x+4
+         ora   x+6
+         beq   nanInf0                    return (x * y) + z computed with SANE
+         lda   x+8
+         asl   a
+         cmp   #32767*2
+         beq   nanInf0
+
+         lda   y
+         ora   y+2
+         ora   y+4
+         ora   y+6
+         beq   nanInf0
+         lda   y+8
+         asl   a
+         cmp   #32767*2
+         beq   nanInf0
+
+         lda   z+8                      else if z is INF or NAN then
+         asl   a
+         cmp   #32767*2
+         beq   x_plus_z                   return x + z computed with SANE
+         inc   extra
+         lda   z                        else if z is 0 then
+         ora   z+2
+         ora   z+4
+         ora   z+6                        return x * y computed with SANE
+         bne   compute                  else compute fma(x,y,z) ourselves
+
+;
+; Compute with SANE if any operands are NAN/INF/0
+;
+nanInf0  tdc                            if in first or third case above then
+         clc
+         adc   #y
+         pea   0
+         pha
+         adc   #x-y
+         pea   0
+         pha
+         FMULX                            x = x * y
+
+x_plus_z ldy   extra                    if in first or second case above then
+         bne   return_x
+
+         tdc                            
+         clc
+         adc   #z
+         phy
+         pha
+         adc   #x-z
+         phy
+         pha
+         FADDX                            x = x + z
+
+return_x lda   x                        copy result to t1
+         sta   t1
+         lda   x+2
+         sta   t1+2
+         lda   x+4
+         sta   t1+4
+         lda   x+6
+         sta   t1+6
+         lda   x+8
+         sta   t1+8
+         brl   ret                      return result
+
+;
+; Compute it ourselves if all operands are finite and non-zero
+;
+compute  stz   xcps                     no exceptions so far
+         lda   x                        copy mantissa of x to mant1
+         sta   mant1
+         lda   x+2
+         sta   mant1+2
+         lda   x+4
+         sta   mant1+4
+         lda   x+6
+         sta   mant1+6
+         stz   mant1+8
+         stz   mant1+10
+         stz   mant1+12
+         stz   mant1+14
+
+         ldy   #64                      multiply mantissas (64 x 64 to 128-bit)
+ml1      lda   mant1
+         lsr   a
+         bcc   ml2
+         clc                              add multiplicand to partial product
+         lda   mant1+8
+         adc   y
+         sta   mant1+8
+         lda   mant1+10
+         adc   y+2
+         sta   mant1+10
+         lda   mant1+12
+         adc   y+4
+         sta   mant1+12
+         lda   mant1+14
+         adc   y+6
+         sta   mant1+14
+ml2      ror   mant1+14                   shift the interim result
+         ror   mant1+12
+         ror   mant1+10
+         ror   mant1+8
+         ror   mant1+6
+         ror   mant1+4
+         ror   mant1+2
+         ror   mant1
+         dey                              loop until done
+         bne   ml1
+
+         lda   x+8                      calculate exponent
+         asl   a
+         sta   exp1
+         lda   y+8
+         asl   a
+         clc
+         adc   exp1
+         ror   a
+         sta   exp1
+         stz   exp1+2
+         add4  exp1,#-16383+1
+         
+         lda   mant1+14                 normalize calculated value
+         bmi   getsign1
+norm1_lp dec4  exp1
+         asl   mant1
+         rol   mant1+2
+         rol   mant1+4
+         rol   mant1+6
+         rol   mant1+8
+         rol   mant1+10
+         rol   mant1+12
+         rol   mant1+14
+         bpl   norm1_lp
+         
+getsign1 lda   x+8                      get sign of x*y
+         eor   y+8
+         sta   sign1
+         
+         lda   z+8                      get sign of z
+         sta   sign2
+
+         and   #$7fff                   copy exponent of z to exp2
+         sta   exp2
+         stz   exp2+2
+
+         stz   mant2                    copy mantissa of z to mant2
+         stz   mant2+2
+         stz   mant2+4
+         stz   mant2+6
+         lda   z
+         sta   mant2+8
+         lda   z+2
+         sta   mant2+10
+         lda   z+4
+         sta   mant2+12
+         lda   z+6
+         sta   mant2+14
+
+         bmi   exp_cmp                  normalize z value
+norm2_lp dec4  exp2
+         asl   mant2+8                    (low mantissa bits stay 0)
+         rol   mant2+10
+         rol   mant2+12
+         rol   mant2+14
+         bpl   norm2_lp
+
+exp_cmp  cmp4  exp1,exp2                if exp1 < exp2
+         bge   do_align
+         jsr   exchange                   exchange value 1 and value 2
+         
+; at this point, exp1 >= exp2
+do_align stz   extra                    initially extra bits are 0
+         sub4  exp1,exp2,expdiff        expdiff = exp1 - exp2
+         cmpl  expdiff,#65+1            if expdiff > 65 then
+         blt   aligntst
+         stz   mant2                      zero out mant2
+         stz   mant2+2
+         stz   mant2+4
+         stz   mant2+6
+         stz   mant2+8
+         stz   mant2+10
+         stz   mant2+12
+         stz   mant2+14
+         inc   extra                      but set the sticky bit for rounding
+         bra   addorsub                 else
+         
+align_lp dec4  expdiff
+         lsr   mant2+14                   shift mant2 until it is aligned
+         ror   mant2+12
+         ror   mant2+10
+         ror   mant2+8
+         ror   mant2+6
+         ror   mant2+4
+         ror   mant2+2
+         ror   mant2
+         ror   extra
+         bcc   aligntst                   maintain sticky bit
+         lda   #$0001
+         tsb   extra
+aligntst lda   expdiff
+         ora   expdiff+2
+         bne   align_lp 
+
+addorsub lda   sign1                    if signs of x*y and z are the same then
+         eor   sign2
+         bmi   subtract
+
+         clc                              mant1 = mant1 + mant2
+         ldx   #-16
+addLoop  lda   mant1+16,x
+         adc   mant2+16,x
+         sta   mant1+16,x
+         inx
+         inx
+         bmi   addLoop
+         bcc   add_done                   if there is carry out
+         ror   mant1+14                     rotate carry back into result
+         ror   mant1+12
+         ror   mant1+10
+         ror   mant1+8
+         ror   mant1+6
+         ror   mant1+4
+         ror   mant1+2
+         ror   mant1
+         ror   extra
+         bcc   inc_exp                      maintain sticky bit
+         lda   #$0001
+         tsb   extra
+inc_exp  inc4  exp1                         increment exponent
+add_done bra   xtrabits                 else
+
+subtract ldx   #14                        if mant1 < mant2 then
+subCmpLp lda   mant1,x                      (note: only occurs if mant2 was
+         cmp   mant2,x                      not shifted, so extra is 0)
+         bne   sub_cmp
+         dex
+         dex
+         bpl   subCmpLp
+sub_cmp  bge   do_sub
+         jsr   exchange                     exchange mant2 and mant1
+
+do_sub   sec                              mant1 = mant1 - mant2 (including extra)
+         lda   #0
+         sbc   extra
+         sta   extra
+         ldx   #-16
+subLoop  lda   mant1+16,x
+         sbc   mant2+16,x
+         sta   mant1+16,x
+         inx
+         inx
+         bmi   subLoop
+         ora   mant1                      if result (including extra bits) is 0 then
+         ora   mant1+2
+         ora   mant1+4
+         ora   mant1+6
+         ora   mant1+8
+         ora   mant1+10
+         ora   mant1+12
+         ora   extra
+         bne   subalign
+         stz   exp1                         set exponent to 0
+         stz   sign1                        set sign to +
+         FGETENV                            if rounding direction is downward then
+         txa
+         bpl   savezero
+         asl   a
+         bmi   savezero
+         dec   sign1                          set sign to -
+savezero brl   do_save                      skip to return
+subalign lda   mant1+14
+         bmi   xtrabits                   normalize after subtraction, if needed
+subAl_lp dec4  exp1
+         asl   extra
+         rol   mant1
+         rol   mant1+2
+         rol   mant1+4
+         rol   mant1+6
+         rol   mant1+8
+         rol   mant1+10
+         rol   mant1+12
+         rol   mant1+14
+subAlNeg bpl   subAl_lp
+
+xtrabits lda   mant1                    consolidate extra bits (into mant1+6)
+         ora   mant1+2
+         ora   mant1+4
+         ora   extra
+         beq   denorm
+         lda   #$0001
+         tsb   mant1+6
+
+denorm   lda   #INEXACT                 assume INEXACT is just INEXACT
+         bra   denormCk                 while exponent is too small
+denormLp inc4  exp1                       increment exponent
+         lsr   mant1+14                   shift mantissa right
+         ror   mant1+12
+         ror   mant1+10
+         ror   mant1+8
+         ror   mant1+6
+         bcc   denorm2                    maintain sticky bit
+         lda   #$0001
+         tsb   mant1+6
+denorm2  lda   #UNDERFLOW+INEXACT         flag that INEXACT also implies UNDERFLOW
+denormCk ldy   exp1+2                     
+         bmi   denormLp
+
+         ldy   mant1+6                  if there are extra bits then
+         beq   saveval       
+         tsb   xcps                       set inexact (+ maybe underflow) exception
+         FGETENV                          get rounding direction
+         txa
+         asl   a
+         bcs   roundDn0
+         bmi   roundUp                    if rounding to nearest then
+         lda   mant1+6                      if first extra bit is 0
+         bpl   saveval                        do not round
+         asl   a                            else if remaining extra bits are non-zero
+         bne   do_round
+         lda   mant1+8                        or low-order bit of result is 1 then
+         lsr   a
+         bcc   saveval
+         bra   do_round                       apply rounding
+
+roundUp  lda   sign1                      if rounding upward then
+         bmi   saveval                      if positive then
+         bra   do_round                       apply rounding
+
+roundDn0 bmi   saveval                    if rounding downward then
+         lda   sign1                        if negative then
+         bpl   saveval                        apply rounding
+
+do_round inc   mant1+8                    (perform the rounding, if needed)
+         bne   saveval
+         inc   mant1+10
+         bne   saveval
+         inc   mant1+12
+         bne   saveval
+         inc   mant1+14
+         bne   saveval
+         sec
+         ror   mant1+14
+         ror   mant1+12
+         ror   mant1+10
+         ror   mant1+8
+         inc4  exp1
+
+saveval  lda   exp1+2                   if value is too large to represent then
+         bne   save_inf
+         lda   exp1
+         cmp   #32766+1
+         blt   do_save
+save_inf lda   #32767                     set it to infinity
+         sta   exp1
+         stz   mant1+8
+         stz   mant1+10
+         stz   mant1+12
+         stz   mant1+14
+         lda   #OVERFLOW+INEXACT          set overflow and inexact exceptions
+         tsb   xcps
+do_save  lda   mant1+8                  generate result
+         sta   t1
+         lda   mant1+10
+         sta   t1+2
+         lda   mant1+12
+         sta   t1+4
+         lda   mant1+14
+         sta   t1+6
+         lda   exp1
+         asl   a
+         asl   sign1
+         ror   a
+         sta   t1+8
+         
+         lda   xcps                     if there were exceptions then
+         beq   ret
+         pha                              set them in SANE environment
+         FSETXCP
+
+ret      creturn 10:t1                  return t1
+
+; local subroutine - exchange value 1 and value 2
+; Note: requires mant1/exp1/sign1 and mant2/exp2/sign2 to be in order
+exchange ldx   #16+4+2-2
+xchgLp   lda   mant1,x
+         ldy   mant2,x
+         sta   mant2,x
+         sty   mant1,x
+         dex
+         dex
+         bpl   xchgLp
+         rts
+         end
+
 ****************************************************************
 *
 *  double fmax(double x, double y);
--- a/math2.macros
+++ b/math2.macros
@@ -703,3 +703,48 @@
 LDX #$090A
 JSL $E10000
 MEND
+ macro
+&l dec4 &a
+&l ~setm
+ lda &a
+ bne ~&SYSCNT
+ dec 2+&a
+~&SYSCNT dec &a
+ ~restm
+ mend
+ macro
+&l add4 &m1,&m2,&m3
+ lclb &yistwo
+ lclc &c
+&l ~setm
+ aif c:&m3,.a
+&c amid "&m2",1,1
+ aif "&c"<>"#",.a
+&c amid "&m1",1,1
+ aif "&c"="{",.a
+ aif "&c"="[",.a
+&c amid "&m2",2,l:&m2-1
+ aif &c>=65536,.a
+ clc
+ ~lda &m1
+ ~op adc,&m2
+ ~sta &m1
+ bcc ~&SYSCNT
+ ~op.h inc,&m1
+~&SYSCNT anop
+ ago .c
+.a
+ aif c:&m3,.b
+ lclc &m3
+&m3 setc &m1
+.b
+ clc
+ ~lda &m1
+ ~op adc,&m2
+ ~sta &m3
+ ~lda.h &m1
+ ~op.h adc,&m2
+ ~sta.h &m3
+.c
+ ~restm
+ mend