Optimize lgamma.

This includes adjustments that make it a little faster for most input values, but especially for |x| < 7. The impact on accuracy should be minimal.
Implement lgamma (C99).
2023-05-22 18:05:15 -05:00 · 2023-05-21 18:24:01 -05:00
2 changed files with 372 additions and 0 deletions
--- a/math2.asm
+++ b/math2.asm
@ -1906,6 +1906,373 @@ ret      creturn 2:x                    return it
         rtl
         end

+****************************************************************
+*
+*  double lgamma(double x);
+*
+*  Computes the natural logarithm of the absolute value of the
+*  gamma function of x.
+*
+*  The rational approximations used near 1 and 2 are from
+*  W. J. Cody and K. E. Hillstrom, "Chebyshev Approximations
+*  for the Natural Logarithm of the Gamma Function".
+*  For other positive x values, a computation based on
+*  Stirling's formula is used.
+*
+****************************************************************
+*
+lgamma   start
+lgammaf  entry
+lgammal  entry
+         using MathCommon2
+
+         csubroutine (10:x),0
+
+         phb
+         phk
+         plb
+
+         pha                            save env & set to default
+         tsc
+         inc   a
+         pea   0
+         pha
+         FPROCENTRY
+         
+         stz   reflect                  assume no reflection
+
+; For x < 0, lgamma(x) = ln(pi) - ln(abs(x*sin(pi*x))) - lgamma(-x)
+         lda   x+8                      if x is negative then
+         jpl   positive
+         
+         lda   x                          if x != -0 and x != -inf then
+         ora   x+2
+         ora   x+4
+         ora   x+6
+         jeq   abs_x
+   
+         inc   reflect                      flag to do reflection
+         
+         ldx   #8                           neg_adj := x
+lp1      lda   x,X
+         sta   neg_adj,X
+         dex
+         dex
+         bpl   lp1
+
+         ph4   #two                         neg_adj := neg_adj REM 2
+         ph4   #neg_adj
+         FREMI
+         
+         ph4   #pi                          neg_adj := sin(neg_adj*pi)
+         ph4   #neg_adj
+         FMULX
+         ph4   #neg_adj
+         FSINX
+         
+         tdc                                neg_adj := neg_adj * x
+         clc
+         adc   #x
+         pea   0
+         pha
+         ph4   #neg_adj
+         FMULX
+         
+         asl   neg_adj+8                    neg_adj := abs(neg_adj)
+         lsr   neg_adj+8
+         
+         ph4   #neg_adj                     neg_adj := ln(neg_adj)
+         FLNX
+         
+         ph4   #ln_pi                       neg_adj := neg_adj - ln(pi)
+         ph4   #neg_adj
+         FSUBX
+         
+         lda   neg_adj+8                    if neg_adj is inf/nan then
+         asl   a
+         cmp   #32767*2
+         bne   abs_x
+
+         ldx   #8                             z := neg_adj
+lp2      lda   neg_adj,X
+         sta   z,X
+         dex
+         dex
+         bpl   lp2
+         
+         brl   ret_negz                       return -z
+
+abs_x    asl   x+8                        x := abs(x)
+         lsr   x+8
+
+positive ldx   #8                       t1 := x
+lp3      lda   x,X
+         sta   t1,X
+         dex
+         dex
+         bpl   lp3
+
+         ph4   #rat2_hi                 if x is near 1 or 2 then
+         ph4   #t1
+         FCMPS
+         jvc   not_near_1_or_2
+
+; Rational approximation for x near 2
+         ph4   #rat2_lo                   if x is near 2 then
+         ph4   #t1
+         FCMPS
+         bvs   small
+
+         ph4   #z                           z := P2(t1)
+         pea   7
+         ph4   #P2
+         jsl   poly
+
+         ph4   #y                           y := Q2(t1)
+         pea   7
+         ph4   #Q2
+         jsl   poly
+         
+         ph4   #two                         N := 2
+         bra   finish_rational_approx
+
+; Rational approximation for x near 1
+small    ph4   #rat1_hi                   else if x is near 1 then
+         ph4   #t1
+         FCMPS
+         jvc   not_near_1_or_2
+
+         ph4   #rat1_lo
+         ph4   #t1
+         FCMPS
+         bvs   not_near_1_or_2
+
+         ph4   #z                           z := P1(t1)
+         pea   7
+         ph4   #P1
+         jsl   poly
+
+         ph4   #y                           y := Q1(t1)
+         pea   7
+         ph4   #Q1
+         jsl   poly
+         
+         ph4   #one                         N := 1
+
+finish_rational_approx anop 
+         ph4   #t1                        t1 := x - N
+         FSUBI
+
+         ph4   #y                         z := z/y
+         ph4   #z
+         FDIVX
+         
+         ph4   #t1                        z := z * t1
+         ph4   #z
+         FMULX
+         brl   chk_reflect                goto chk_reflect
+
+; Calculate based on Stirling's formula for other x values
+not_near_1_or_2 anop
+         stz   scaled                   assume no scaling used
+
+         stz   z                        z := 1.0
+         stz   z+2
+         stz   z+4
+         lda   #$8000
+         sta   z+6
+         lda   #16383
+         sta   z+8
+
+chk_big  ph4   #cutoff                  while x < 9.41796875 do
+         tdc
+         clc
+         adc   #x
+         pea   0
+         pha
+         FCMPS
+         bvc   stirling
+
+         inc   scaled                     flag that scaling was used
+
+         tdc                              z := z * x
+         clc
+         adc   #x
+         pea   0
+         pha
+         ph4   #z
+         FMULX
+
+         ph4   #one                       x := x + 1
+         tdc
+         clc
+         adc   #x
+         pea   0
+         pha
+         FADDI
+         bra   chk_big
+
+stirling ldx   #8                       t1 := x
+lp4      lda   x,X                      y := x
+         sta   t1,X
+         sta   y,X
+         dex
+         dex
+         bpl   lp4
+
+         ph4   #t1                      t1 := ln(t1)
+         FLNX
+
+         ph4   #one_half                y := y - 1/2
+         ph4   #y
+         FSUBS
+
+         ph4   #t1                      y := t1 * y
+         ph4   #y
+         FMULX
+
+         lda   y+8                      if y is not inf/nan then
+         asl   a
+         cmp   #32767*2
+         beq   stirling_poly
+
+         tdc                              y := y - x
+         clc
+         adc   #x
+         pea   0
+         pha
+         ph4   #y
+         FSUBX
+         
+         ph4   #half_ln_2pi               y := y + 0.5*ln(2pi)
+         ph4   #y
+         FADDX
+         
+         lda   scaled                     if scaled then
+         beq   stirling_poly
+
+         ph4   #z                           z := ln(z)
+         FLNX
+         
+         ph4   #z                           y := y - z
+         ph4   #y
+         FSUBX
+
+stirling_poly anop
+         ldx   #8                       t1 := x
+lp5      lda   x,X
+         sta   t1,X
+         dex
+         dex
+         bpl   lp5
+
+         pea   -2                       t1 := 1/t1^2
+         ph4   #t1
+         FXPWRI
+
+         ph4   #z                       z := P_stirling(t1)
+         pea   8
+         ph4   #P_stirling
+         jsl   poly
+
+         tdc                            z := z / x
+         clc
+         adc   #x
+         pea   0
+         pha
+         ph4   #z
+         FDIVX
+         
+         ph4   #y                       z := y + z
+         ph4   #z
+         FADDX
+
+chk_reflect anop
+         lda   reflect                  if reflection is needed then
+         beq   done
+
+         ph4   #neg_adj                   z := z + neg_adj
+         ph4   #z
+         FADDX
+         
+ret_negz lda   z+8                        z := -z
+         eor   #$8000
+         sta   z+8
+
+done     FPROCEXIT                      restore env & raise any new exceptions
+         plb
+         creturn 10:z                   return a pointer to the result
+
+; constants
+cutoff   dc    f'9.41796875'            above this, just use Stirling's formula
+;low/high limits for use of rational approximations near 1 and 2
+rat1_lo  dc    f'0.97265625'
+rat1_hi  dc    f'1.02880859375'
+rat2_lo  dc    f'1.76220703125'
+rat2_hi  dc    f'2.208984375'
+
+one_half dc    f'0.5'
+half_ln_2pi dc e'0.9189385332046727417803297'
+one      dc    i2'1'
+two      dc    i2'2'
+ln_pi    dc    e'1.14472988584940017414342735'
+pi       dc    e'3.1415926535897932384626433'
+
+; coefficients for rational approximation, 0.5 <= x <= 1.5
+P1       dc    e'4.120843185847770031e00'
+         dc    e'8.568982062831317339e01'
+         dc    e'2.431752435244210223e02'
+         dc    e'-2.617218583856145190e02'
+         dc    e'-9.222613728801521582e02'
+         dc    e'-5.176383498023217924e02'
+         dc    e'-7.741064071332953034e01'
+         dc    e'-2.208843997216182306e00'
+Q1       dc    e'1.000000000000000000e00'
+         dc    e'4.564677187585907957e01'
+         dc    e'3.778372484823942081e02'
+         dc    e'9.513235976797059772e02'
+         dc    e'8.460755362020782006e02'
+         dc    e'2.623083470269460180e02'
+         dc    e'2.443519662506311704e01'
+         dc    e'4.097792921092615065e-01'
+; coefficients for rational approximation, 1.5 <= x <= 4.0
+P2       dc    e'5.1550576176408171704e00'
+         dc    e'3.7751067979721702241e02'
+         dc    e'5.2689832559149812458e03'
+         dc    e'1.9553605540630449846e04'
+         dc    e'1.2043173809871640151e04'
+         dc    e'-2.0648294205325283281e04'
+         dc    e'-1.5086302287667250272e04'
+         dc    e'-1.5138318341150667785e03'
+Q2       dc    e'1.0000000000000000000e00'
+         dc    e'1.2890931890129576873e02'
+         dc    e'3.0399030414394398824e03'
+         dc    e'2.2029562144156636889e04'
+         dc    e'5.7120255396025029854e04'
+         dc    e'5.2622863838411992470e04'
+         dc    e'1.4402090371700852304e04'
+         dc    e'6.9832741405735102159e02'
+; coefficients for Stirling series approximation
+P_stirling anop
+         dc    e'0.1796443723688305731649385'
+         dc    e'-0.02955065359477124183006536'
+         dc    e'0.006410256410256410256410256'
+         dc    e'-0.001917526917526917526917527'
+         dc    e'0.0008417508417508417508417508'
+         dc    e'-0.0005952380952380952380952381'
+         dc    e'0.0007936507936507936507936508'
+         dc    e'-0.002777777777777777777777778'
+         dc    e'0.08333333333333333333333333'
+
+; temporaries/return values
+y        ds    10
+z        ds    10
+
+neg_adj  ds    10                       adjustment for negative x
+reflect  ds    2                        reflection flag
+scaled   ds    2                        scaling flag
+         end
+
 ****************************************************************
 *
 *  long long llrint(double x);
--- a/math2.macros
+++ b/math2.macros
@ -748,3 +748,8 @@
 .c
 ~restm
 mend
+ macro
+&l jvc &bp
+&l bvs *+5
+ brl &bp
+ mend