of816/asm/mathlib.s


; Math Library - I don't reinvent the wheel here for multiplication, division, etc.
; others have done great work before me and I credit them when I know who did it.

; 32-bit signed comparison
; C and Z reflect same comparision results as CMP instruction
.proc     _stest32
          lda   STACKBASE+6,x
          eor   STACKBASE+2,x
          bpl   samesign
          lda   STACKBASE+2,x       ; Kelvin's excellent solution
          sec
          rol
          rts
samesign: lda   STACKBASE+6,x
          cmp   STACKBASE+2,x
          bcc   :+                  ; less than or not equal, done
          bne   :+
          lda   STACKBASE+4,x
          cmp   STACKBASE+0,x
:         rts
.endproc
_test32   = _stest32::samesign

.proc     _invertay
          pha
          tya
          eor   #$FFFF
          tay
          pla
          eor   #$FFFF
          rts
.endproc

.proc     _negateay
          pha
          tya
          eor   #$FFFF
          clc
          adc   #$0001
          tay
          pla
          eor   #$FFFF
          adc   #$0000
          rts
.endproc

.proc     _invert
          lda   STACKBASE+0,x
          eor   #$FFFF
          sta   STACKBASE+0,x
          lda   STACKBASE+2,x
          eor   #$FFFF
          sta   STACKBASE+2,x
          rts
.endproc

.proc     _negate
          jsr   _invert
          inc   STACKBASE+0,x
          bne   :+
          inc   STACKBASE+2,x
:         rts
.endproc

.proc     _dinvert
          jsr   _invert
          lda   STACKBASE+4,x
          eor   #$FFFF
          sta   STACKBASE+4,x
          lda   STACKBASE+6,x
          eor   #$FFFF
          sta   STACKBASE+6,x
          rts
.endproc

.proc     _dnegate
          jsr   _dinvert
          inc   STACKBASE+4,x
          bne   :+
          inc   STACKBASE+6,x
          bne   :+
          inc   STACKBASE+0,x
          bne   :+
          inc   STACKBASE+2,x
:         rts
.endproc

.proc     _2abs
          bit   STACKBASE+2,x
          bpl   :+
          jsr   _negate
:         jsr   _swap
          ; fall-through
.endproc

.proc     _abs
          bit   STACKBASE+2,x
          bpl   :+
          jsr   _negate
:         rts          
.endproc

.proc     _signum
          ldy   #$FFFF
          lda   STACKBASE+2,x
          bpl   :+
          sty   STACKBASE+2,x
          bra   done
:         iny
          stz   STACKBASE+2,x
          ora   STACKBASE+0,x
          beq   done
          iny
done:     sty   STACKBASE+0,x
          rts
.endproc

; 32-bit unsigned multiplication with 64-bit result
; right-shifting version by dclxvi
; scratch in YR, YR+2 (preserved)
.proc     _umult
N         = YR
          lda   N+2
          pha
          lda   N
          pha
          lda   #$00
          sta   N
          ldy   #32
          lsr   STACKBASE+6,x
          ror   STACKBASE+4,x
l1:       bcc   l2
          clc
          sta   N+2
          lda   N
          adc   STACKBASE+0,x
          sta   N
          lda   N+2
          adc   STACKBASE+2,x
l2:       ror
          ror   N
          ror   STACKBASE+6,x
          ror   STACKBASE+4,x
          dey
          bne   l1
          sta   STACKBASE+2,x
          lda   N
          sta   STACKBASE+0,x
          pla
          sta   N
          pla
          sta   N+2
          rts
.endproc

; 64-bit divided by 32-bit with 32-bit quotient and remainder
; Adapted from Garth's routine, just like everyone else :-)
; carry set if divison by zero or overflow
; ( d n -- r q )
; d.hi = stack(4,6), d.low = stack(8,10), n=stack(0,2)
.proc     _umdivmod
CARRY     = YR
SCRATCH   = YR+2
.if 1 ; shortcut 32-bit by 32-bit division
          lda   STACKBASE+4,x
          ora   STACKBASE+6,x
          beq   _udivmod32          ; go do faster 32-bit divide
.endif
          lda   SCRATCH
          pha
          lda   CARRY
          pha
          sec                       ; first, check for overflow and division by 0
          lda   STACKBASE+4,x
          sbc   STACKBASE+0,x
          lda   STACKBASE+6,x
          sbc   STACKBASE+2,x
          bcs   overflow
          lda   #33                 ; 32 bits + 1
          sta   XR
loop:     rol   STACKBASE+8,x
          rol   STACKBASE+10,x
          dec   XR
          beq   done
          rol   STACKBASE+4,x
          rol   STACKBASE+6,x
          stz   CARRY
          rol   CARRY
          sec
          lda   STACKBASE+4,x
          sbc   STACKBASE+0,x
          sta   SCRATCH
          lda   STACKBASE+6,x
          sbc   STACKBASE+2,x
          tay
          lda   CARRY
          sbc   #0
          bcc   loop
          lda   SCRATCH
          sta   STACKBASE+4,x
          sty   STACKBASE+6,x
          bra   loop
overflow: sec
          bra   done1
done:     clc
          inx                       ; drop
          inx
          inx
          inx
done1:    pla
          sta   CARRY
          pla
          sta   SCRATCH
          bcs   :+                  ; leave stack intact if exception
          jmp   _swap1
:         rts
.endproc

; 32-bit by 32-bit division
; assumes that the second stack entry is zero
; ( d n -- r q ) where d.hi is zero e.g. ( n1 0 n2 -- r q )
; d.hi = stack(4,6) = 0, d.low = n1 = stack(8,10), n2 = stack(0,2)
.proc     _udivmod32
          lda   #32
          sta   XR
l1:       asl   STACKBASE+8,x       ; shift high bit of n1 into r
          rol   STACKBASE+10,x      ; clearing the low bit for q
          rol   STACKBASE+4,x       ; r.lo
          rol   STACKBASE+6,x       ; r.hi
          lda   STACKBASE+4,x       ; r.lo
          sec                       ; trial subraction
          sbc   STACKBASE+0,x       ; n2.lo
          tay                       ; save low word
          lda   STACKBASE+6,x       ; r.hi
          sbc   STACKBASE+2,x       ; n2.hi
          bcc   l2                  ; subtraction succeeded?
          sta   STACKBASE+6,x       ; r.hi yes, save result
          sty   STACKBASE+4,x       ; r.lo
          inc   STACKBASE+8,x       ; n1.lo and record a 1 in the quotient
l2:       dec   XR                  ; next bit
          bne   l1
          inx                       ; kill of top stack item
          inx
          inx
          inx
          clc                       ; this *never* overflows
          jmp   _swap1
.endproc

; ( d n -- ud u )
.proc     _dnabs
          lda   STACKBASE+2,x     ; take absolute value of n1
          bpl   :+                ; if needed
          jsr   _negate
:         lda   STACKBASE+6,x     ; take absolute value of d
          bpl   :+                ; if needed
dtneg:    inx
          inx
          inx
          inx
          jsr   _dnegate
          dex
          dex
          dex
          dex
:         rts
.endproc
_dtucknegate = _dnabs::dtneg

.proc     _smdivrem
          lda   STACKBASE+6,x     ; save dividend sign in MSW of high cell of d
          pha
          eor   STACKBASE+2,x     ; compute result sign and save
          pha
          jsr   _dnabs            ; take absolute value of arguments
          jsr   _umdivmod
          bcs   overflow          ; overflow
          pla                     ; see if we should negate quotient
          bpl   :+
          jsr   _negate           ; make it negative
:         pla                     ; get dividend sign
          bpl   :+
tneg:     inx                     ; negate remainder if it should be negative
          inx
          inx
          inx
          jsr   _negate
          dex
          dex
          dex
          dex
:         clc
          rts
overflow: pla                     ; carry is set, pla does not affect it
          pla
          rts
.endproc
_tucknegate = _smdivrem::tneg

; ( d1 n1 -- n2 n3 )
; Divide dividend d1 by divisor n1, giving the floored quotient n3 and the remainder n2.
; Input and output stack arguments are signed.
.proc     _fmdivmod
          lda   STACKBASE+2,x     ; high word of n1
          pha                     ; save divisor sign
          bpl   :+
          jsr   _negate           ; negate both if n1 was negative
          jsr   _dtucknegate
:         lda   STACKBASE+6,x     ; highest word of d1
          bpl   :+
          lda   STACKBASE+0,x     ; it's negative, add n1 to high cell of d1
          clc
          adc   STACKBASE+4,x
          sta   STACKBASE+4,x
          lda   STACKBASE+2,x
          adc   STACKBASE+6,x
          sta   STACKBASE+6,x
:         jsr   _umdivmod         ; UM/MOD
          pla                     ; get divisor sign back (carry unaffected)
          bcs   :+                ; error
          bpl   :+                ; if n1 was positive
          jsr   _tucknegate       ; if it was, negate the result (clears carry)
:         rts
.endproc

; adapted from Lee Davidson routine
; ( u1 -- u1 u2 ) u1 = closest integer <= square root, u2 = remainder
; number popped into WR,WR+2
; remainder on stack, offsets 0,2
; root on stack, offsets 4,6
; temp in YR
; counter in XR
.proc     _sqroot
          jsr   _peekwr           ; get number into WR
          jsr   _stackdecr        ; make room for remainder
          lda   #16               ; pairs of bits
          sta   XR                ; counter
          lda   #$0000
          sta   STACKBASE+0,x     ; init remainder
          sta   STACKBASE+2,x
          sta   STACKBASE+4,x     ; init root
          sta   STACKBASE+6,x

lp:       asl   STACKBASE+4,x     ; root = root * 2
          asl   WR                ; now shift 2 bits of number into remainder
          rol   WR+2
          rol   STACKBASE+0,x
          rol   STACKBASE+2,x
          asl   WR
          rol   WR+2
          rol   STACKBASE+0,x
          rol   STACKBASE+2,x
          lda   STACKBASE+4,x     ; copy root into temp
          sta   YR
          lda   STACKBASE+6,x     ; (a bit shorter than immediate load)
          sta   YR+2
          sec                     ; +1
          rol   YR                ; temp = temp * 2 + 1
          rol   YR+2
          lda   STACKBASE+2,x     ; compare remainder with partial
          cmp   YR+2
          bcc   next              ; skip sub if remainder smaller
          bne   subtr             ; but do it if equal
          lda   STACKBASE+0,x     
          cmp   YR
          bcc   next              ; same deal
subtr:    lda   STACKBASE+0,x     ; subtract partial from remainder
          sbc   YR
          sta   STACKBASE+0,x
          lda   STACKBASE+2,X
          sbc   YR+2
          sta   STACKBASE+2,x
          inc   STACKBASE+4,x     ; no need to increment high word, always zero
next:     dec   XR
          bne   lp
          rts
.endproc
initial full commit 2019-07-01 17:33:44 +00:00
			`; Math Library - I don't reinvent the wheel here for multiplication, division, etc.`
			`; others have done great work before me and I credit them when I know who did it.`

			`; 32-bit signed comparison`
			`; C and Z reflect same comparision results as CMP instruction`
			`.proc _stest32`
			`lda STACKBASE+6,x`
			`eor STACKBASE+2,x`
			`bpl samesign`
resolve #4, h/t ksherlock 2020-01-04 03:41:25 +00:00			`lda STACKBASE+2,x ; Kelvin's excellent solution`
			`sec`
			`rol`
initial full commit 2019-07-01 17:33:44 +00:00			`rts`
			`samesign: lda STACKBASE+6,x`
			`cmp STACKBASE+2,x`
			`bcc :+ ; less than or not equal, done`
			`bne :+`
			`lda STACKBASE+4,x`
			`cmp STACKBASE+0,x`
			`: rts`
			`.endproc`
resolve #6; general clean-up 2020-01-05 00:25:41 +00:00			`_test32 = _stest32::samesign`
initial full commit 2019-07-01 17:33:44 +00:00
			`.proc _invertay`
			`pha`
			`tya`
			`eor #$FFFF`
			`tay`
			`pla`
			`eor #$FFFF`
			`rts`
			`.endproc`

			`.proc _negateay`
			`pha`
			`tya`
			`eor #$FFFF`
			`clc`
			`adc #$0001`
			`tay`
			`pla`
			`eor #$FFFF`
			`adc #$0000`
			`rts`
			`.endproc`

			`.proc _invert`
			`lda STACKBASE+0,x`
			`eor #$FFFF`
			`sta STACKBASE+0,x`
			`lda STACKBASE+2,x`
			`eor #$FFFF`
			`sta STACKBASE+2,x`
			`rts`
			`.endproc`

			`.proc _negate`
			`jsr _invert`
			`inc STACKBASE+0,x`
			`bne :+`
			`inc STACKBASE+2,x`
			`: rts`
			`.endproc`

			`.proc _dinvert`
			`jsr _invert`
			`lda STACKBASE+4,x`
			`eor #$FFFF`
			`sta STACKBASE+4,x`
			`lda STACKBASE+6,x`
			`eor #$FFFF`
			`sta STACKBASE+6,x`
			`rts`
			`.endproc`

			`.proc _dnegate`
			`jsr _dinvert`
			`inc STACKBASE+4,x`
			`bne :+`
			`inc STACKBASE+6,x`
			`bne :+`
			`inc STACKBASE+0,x`
			`bne :+`
			`inc STACKBASE+2,x`
			`: rts`
			`.endproc`

			`.proc _2abs`
			`bit STACKBASE+2,x`
			`bpl :+`
			`jsr _negate`
			`: jsr _swap`
			`; fall-through`
			`.endproc`

			`.proc _abs`
			`bit STACKBASE+2,x`
			`bpl :+`
			`jsr _negate`
			`: rts`
			`.endproc`

			`.proc _signum`
			`ldy #$FFFF`
			`lda STACKBASE+2,x`
			`bpl :+`
			`sty STACKBASE+2,x`
			`bra done`
			`: iny`
			`stz STACKBASE+2,x`
			`ora STACKBASE+0,x`
			`beq done`
			`iny`
			`done: sty STACKBASE+0,x`
			`rts`
			`.endproc`

			`; 32-bit unsigned multiplication with 64-bit result`
			`; right-shifting version by dclxvi`
			`; scratch in YR, YR+2 (preserved)`
			`.proc _umult`
			`N = YR`
			`lda N+2`
			`pha`
			`lda N`
			`pha`
			`lda #$00`
			`sta N`
			`ldy #32`
			`lsr STACKBASE+6,x`
			`ror STACKBASE+4,x`
			`l1: bcc l2`
			`clc`
			`sta N+2`
			`lda N`
			`adc STACKBASE+0,x`
			`sta N`
			`lda N+2`
			`adc STACKBASE+2,x`
			`l2: ror`
			`ror N`
			`ror STACKBASE+6,x`
			`ror STACKBASE+4,x`
			`dey`
			`bne l1`
			`sta STACKBASE+2,x`
			`lda N`
			`sta STACKBASE+0,x`
			`pla`
			`sta N`
			`pla`
			`sta N+2`
			`rts`
			`.endproc`

			`; 64-bit divided by 32-bit with 32-bit quotient and remainder`
			`; Adapted from Garth's routine, just like everyone else :-)`
			`; carry set if divison by zero or overflow`
			`; ( d n -- r q )`
			`; d.hi = stack(4,6), d.low = stack(8,10), n=stack(0,2)`
			`.proc _umdivmod`
			`CARRY = YR`
			`SCRATCH = YR+2`
			`.if 1 ; shortcut 32-bit by 32-bit division`
			`lda STACKBASE+4,x`
			`ora STACKBASE+6,x`
			`beq _udivmod32 ; go do faster 32-bit divide`
			`.endif`
			`lda SCRATCH`
			`pha`
			`lda CARRY`
			`pha`
			`sec ; first, check for overflow and division by 0`
			`lda STACKBASE+4,x`
			`sbc STACKBASE+0,x`
			`lda STACKBASE+6,x`
			`sbc STACKBASE+2,x`
			`bcs overflow`
			`lda #33 ; 32 bits + 1`
			`sta XR`
			`loop: rol STACKBASE+8,x`
			`rol STACKBASE+10,x`
			`dec XR`
			`beq done`
			`rol STACKBASE+4,x`
			`rol STACKBASE+6,x`
			`stz CARRY`
			`rol CARRY`
			`sec`
			`lda STACKBASE+4,x`
			`sbc STACKBASE+0,x`
			`sta SCRATCH`
			`lda STACKBASE+6,x`
			`sbc STACKBASE+2,x`
			`tay`
			`lda CARRY`
			`sbc #0`
			`bcc loop`
			`lda SCRATCH`
			`sta STACKBASE+4,x`
			`sty STACKBASE+6,x`
			`bra loop`
			`overflow: sec`
			`bra done1`
			`done: clc`
			`inx ; drop`
			`inx`
			`inx`
			`inx`
			`done1: pla`
			`sta CARRY`
			`pla`
			`sta SCRATCH`
			`bcs :+ ; leave stack intact if exception`
			`jmp _swap1`
			`: rts`
			`.endproc`

			`; 32-bit by 32-bit division`
			`; assumes that the second stack entry is zero`
			`; ( d n -- r q ) where d.hi is zero e.g. ( n1 0 n2 -- r q )`
			`; d.hi = stack(4,6) = 0, d.low = n1 = stack(8,10), n2 = stack(0,2)`
			`.proc _udivmod32`
			`lda #32`
			`sta XR`
			`l1: asl STACKBASE+8,x ; shift high bit of n1 into r`
			`rol STACKBASE+10,x ; clearing the low bit for q`
			`rol STACKBASE+4,x ; r.lo`
			`rol STACKBASE+6,x ; r.hi`
			`lda STACKBASE+4,x ; r.lo`
			`sec ; trial subraction`
			`sbc STACKBASE+0,x ; n2.lo`
			`tay ; save low word`
			`lda STACKBASE+6,x ; r.hi`
			`sbc STACKBASE+2,x ; n2.hi`
			`bcc l2 ; subtraction succeeded?`
			`sta STACKBASE+6,x ; r.hi yes, save result`
			`sty STACKBASE+4,x ; r.lo`
			`inc STACKBASE+8,x ; n1.lo and record a 1 in the quotient`
			`l2: dec XR ; next bit`
			`bne l1`
			`inx ; kill of top stack item`
			`inx`
			`inx`
			`inx`
			`clc ; this never overflows`
			`jmp _swap1`
			`.endproc`

			`; ( d n -- ud u )`
			`.proc _dnabs`
			`lda STACKBASE+2,x ; take absolute value of n1`
fix #5 2020-03-23 00:11:32 +00:00			`bpl :+ ; if needed`
initial full commit 2019-07-01 17:33:44 +00:00			`jsr _negate`
			`: lda STACKBASE+6,x ; take absolute value of d`
fix #5 2020-03-23 00:11:32 +00:00			`bpl :+ ; if needed`
initial full commit 2019-07-01 17:33:44 +00:00			`dtneg: inx`
			`inx`
			`inx`
			`inx`
			`jsr _dnegate`
			`dex`
			`dex`
			`dex`
			`dex`
			`: rts`
			`.endproc`
			`_dtucknegate = _dnabs::dtneg`

			`.proc _smdivrem`
			`lda STACKBASE+6,x ; save dividend sign in MSW of high cell of d`
			`pha`
			`eor STACKBASE+2,x ; compute result sign and save`
			`pha`
			`jsr _dnabs ; take absolute value of arguments`
			`jsr _umdivmod`
			`bcs overflow ; overflow`
			`pla ; see if we should negate quotient`
			`bpl :+`
			`jsr _negate ; make it negative`
			`: pla ; get dividend sign`
			`bpl :+`
			`tneg: inx ; negate remainder if it should be negative`
			`inx`
			`inx`
			`inx`
			`jsr _negate`
			`dex`
			`dex`
			`dex`
			`dex`
			`: clc`
			`rts`
			`overflow: pla ; carry is set, pla does not affect it`
			`pla`
			`rts`
			`.endproc`
			`_tucknegate = _smdivrem::tneg`

fix #5 2020-03-23 00:11:32 +00:00			`; ( d1 n1 -- n2 n3 )`
			`; Divide dividend d1 by divisor n1, giving the floored quotient n3 and the remainder n2.`
			`; Input and output stack arguments are signed.`
initial full commit 2019-07-01 17:33:44 +00:00			`.proc _fmdivmod`
fix #5 2020-03-23 00:11:32 +00:00			`lda STACKBASE+2,x ; high word of n1`
			`pha ; save divisor sign`
initial full commit 2019-07-01 17:33:44 +00:00			`bpl :+`
fix #5 2020-03-23 00:11:32 +00:00			`jsr _negate ; negate both if n1 was negative`
			`jsr _dtucknegate`
			`: lda STACKBASE+6,x ; highest word of d1`
initial full commit 2019-07-01 17:33:44 +00:00			`bpl :+`
fix #5 2020-03-23 00:11:32 +00:00			`lda STACKBASE+0,x ; it's negative, add n1 to high cell of d1`
initial full commit 2019-07-01 17:33:44 +00:00			`clc`
			`adc STACKBASE+4,x`
			`sta STACKBASE+4,x`
			`lda STACKBASE+2,x`
			`adc STACKBASE+6,x`
			`sta STACKBASE+6,x`
fix #5 2020-03-23 00:11:32 +00:00			`: jsr _umdivmod ; UM/MOD`
			`pla ; get divisor sign back (carry unaffected)`
			`bcs :+ ; error`
			`bpl :+ ; if n1 was positive`
			`jsr _tucknegate ; if it was, negate the result (clears carry)`
initial full commit 2019-07-01 17:33:44 +00:00			`: rts`
			`.endproc`

			`; adapted from Lee Davidson routine`
			`; ( u1 -- u1 u2 ) u1 = closest integer <= square root, u2 = remainder`
			`; number popped into WR,WR+2`
			`; remainder on stack, offsets 0,2`
			`; root on stack, offsets 4,6`
			`; temp in YR`
			`; counter in XR`
			`.proc _sqroot`
			`jsr _peekwr ; get number into WR`
			`jsr _stackdecr ; make room for remainder`
			`lda #16 ; pairs of bits`
			`sta XR ; counter`
			`lda #$0000`
			`sta STACKBASE+0,x ; init remainder`
			`sta STACKBASE+2,x`
			`sta STACKBASE+4,x ; init root`
			`sta STACKBASE+6,x`

			`lp: asl STACKBASE+4,x ; root = root * 2`
			`asl WR ; now shift 2 bits of number into remainder`
			`rol WR+2`
			`rol STACKBASE+0,x`
			`rol STACKBASE+2,x`
			`asl WR`
			`rol WR+2`
			`rol STACKBASE+0,x`
			`rol STACKBASE+2,x`
			`lda STACKBASE+4,x ; copy root into temp`
			`sta YR`
			`lda STACKBASE+6,x ; (a bit shorter than immediate load)`
			`sta YR+2`
			`sec ; +1`
			`rol YR ; temp = temp * 2 + 1`
			`rol YR+2`
			`lda STACKBASE+2,x ; compare remainder with partial`
			`cmp YR+2`
			`bcc next ; skip sub if remainder smaller`
			`bne subtr ; but do it if equal`
			`lda STACKBASE+0,x`
			`cmp YR`
			`bcc next ; same deal`
			`subtr: lda STACKBASE+0,x ; subtract partial from remainder`
			`sbc YR`
			`sta STACKBASE+0,x`
			`lda STACKBASE+2,X`
			`sbc YR+2`
			`sta STACKBASE+2,x`
			`inc STACKBASE+4,x ; no need to increment high word, always zero`
			`next: dec XR`
			`bne lp`
			`rts`
			`.endproc`