boot3/OS/FPUEmulation/RemMod.a
Elliot Nunn 5b0f0cc134 Bring in CubeE sources
Resource forks are included only for .rsrc files. These are DeRezzed into their data fork. 'ckid' resources, from the Projector VCS, are not included.

The Tools directory, containing mostly junk, is also excluded.
2017-12-26 10:02:57 +08:00

401 lines
11 KiB
Plaintext

;
; File: RemMod.a
;
; Contains: Routines to handle emulation of FREM and FMOD instructions
; (floating-point remainder and modulus functions)
;
; Originally Written by: Motorola Inc.
; Adapted to Apple/MPW: Jon Okada
;
; Copyright: © 1990, 1991 by Apple Computer, Inc., all rights reserved.
;
; This file is used in these builds: Mac32
;
; Change History (most recent first):
;
; <3> 6/26/91 BG Rolled in Jon's REM code change that speeds up the emulation for
; this instruction by a factor of 9.
; <2> 3/30/91 BG Rolling in Jon Okada's latest changes.
; <1> 12/14/90 BG First checked into TERROR/BBS.
; remmod.a
; Based upon Motorola file 'srem_mod.a'
; CHANGE LOG:
; 07 Jan 91 JPO Changed variable names Y and R to YMOD and RMOD. Changed
; constant label "Scale" to "SCREM". Renamed labels
; "HiX_not0" and "Finish" to "RMHiX_not0" and "RMFinish",
; respectively. Deleted unreferenced label "HiX_0".
; 25 Jun 91 JPO Rewrote to use "div32" subroutine which does up to
; 32 remainder steps at a time.
;
*
* srem_mod.sa 3.1 12/10/90
*
* The entry point sMOD computes the floating point MOD of the
* input values X and Y. The entry point sREM computes the floating
* point (IEEE) REM of the input values X and Y.
*
* INPUT
* -----
* Double-extended value Y is pointed to by address in register
* A0. Double-extended value X is located in -12(A0). The values
* of X and Y are both nonzero and finite; although either or both
* of them can be denormalized. The special cases of zeros, NaNs,
* and infinities are handled elsewhere.
*
* OUTPUT
* ------
* FREM(X,Y) or FMOD(X,Y), depending on entry point.
*
* ALGORITHM
* ---------
*
* Step 1. Save and strip signs of X and Y: signX := sign(X),
* signY := sign(Y), X := |X|, Y := |Y|,
* signQ := signX EOR signY. Record whether MOD or REM
* is requested.
*
* Step 2. Set L := expo(X)-expo(Y), k := 0, Q := 0.
* If (L < 0) then
* R := X, go to Step 4.
* else
* R := 2^(-L)X, j := L.
* endif
*
* Step 3. Perform MOD(X,Y)
* 3.1 If R = Y, go to Step 9.
* 3.2 If R > Y, then { R := R - Y, Q := Q + 1}
* 3.3 If j = 0, go to Step 4.
* 3.4 k := k + 1, j := j - 1, Q := 2Q, R := 2R. Go to
* Step 3.1.
*
* Step 4. At this point, R = X - QY = MOD(X,Y). Set
* Last_Subtract := false (used in Step 7 below). If
* MOD is requested, go to Step 6.
*
* Step 5. R = MOD(X,Y), but REM(X,Y) is requested.
* 5.1 If R < Y/2, then R = MOD(X,Y) = REM(X,Y). Go to
* Step 6.
* 5.2 If R > Y/2, then { set Last_Subtract := true,
* Q := Q + 1, Y := signY*Y }. Go to Step 6.
* 5.3 This is the tricky case of R = Y/2. If Q is odd,
* then { Q := Q + 1, signX := -signX }.
*
* Step 6. R := signX*R.
*
* Step 7. If Last_Subtract = true, R := R - Y.
*
* Step 8. Return signQ, last 7 bits of Q, and R as required.
*
* Step 9. At this point, R = 2^(-j)*X - Q Y = Y. Thus,
* X = 2^(j)*(Q+1)Y. set Q := 2^(j)*(Q+1),
* R := 0. Return signQ, last 7 bits of Q, and R.
*
*
* Copyright (C) Motorola, Inc. 1990
* All Rights Reserved
*
* THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA
* The copyright notice above does not evidence any
* actual or intended publication of such source code.
* SREM_MOD IDNT 2,1 Motorola 040 Floating Point Software Package
MOD_FLAG equ L_SCR3
;SignY equ FP_SCR3+4 ; DELETED <6/25/91, JPO> <T3>
;SignX equ FP_SCR3+8 ; DELETED <6/25/91, JPO> <T3>
;SignQ equ FP_SCR3+12 ; DELETED <6/25/91, JPO> <T3>
;Sc_Flag equ FP_SCR4 ; DELETED <6/25/91, JPO> <T3>
EXPOY equ FP_SCR3+4 ; <6/25/91, JPO>
;Y equ FP_SCR1 ; Renamed <1/7/91, JPO>
YMOD equ FP_SCR1 ; <1/7/91, JPO>
Y_Hi equ YMOD+4 ; <1/7/91, JPO>
Y_Lo equ YMOD+8 ; <1/7/91, JPO>
;R equ FP_SCR2 ; Renemed <1/7/91, JPO>
RMOD equ FP_SCR2 ; <1/7/91, JPO>
R_Hi equ RMOD+4 ; <1/7/91, JPO>
R_Lo equ RMOD+8 ; <1/7/91, JPO>
ALIGN 16 ; <1/7/91, JPO>
;SCREM DC.L $00010000,$80000000,$00000000,$00000000 ; Renamed <1/7/91, JPO> - DELETED <6/25/91, JPO>
; New FMOD and FREM algorithms begin here <6/25/91, JPO> <T3> thru next <T3>
smod:
move.l #0,MOD_FLAG(a6)
bra.b mod_rem
srem:
move.l #1,MOD_FLAG(a6)
mod_rem:
movem.l d2-d7/a2,-(sp) ; save data registers and A2
suba.l a2,a2 ; clear scaling flag (A2)
bfextu (a0){1:15},d3 ; |Y| in D0.w/D1/D2
move.l 8(A0),d5
move.l 4(a0),d4
bmi.b chk_x ; Y is normal
bne.b @1 ; normalize Y
move.l d5,d4 ; Y sig.hi is zero. normalize
clr.l d5
subi.l #32,d3
bfffo d4{0:32},d6
lsl.l d6,d4
sub.l d6,d3
bra.b chk_x
@1:
bfffo d4{0:32},d6 ; Y sig.hi is nonzero. normalize
sub.l d6,d3
lsl.l d6,d4
bfextu d5{0:d6},d7
lsl.l d6,d5
or.l d7,d4
chk_x:
bfextu -12(a0){1:15},d0 ; D0.w/D1/D2 is |x|
move.l -4(a0),d2
move.l -8(a0),d1
bmi.b mod_init ; X is normal
bne.b @1
move.l d2,d1 ; X sig.hi is zero. normalize
clr.l d2
subi.l #32,d0
bfffo d1{0:32},d6
lsl.l d6,d1
sub.l d6,d0
bra.b mod_init
@1:
bfffo d1{0:32},d6 ; X sig.hi is nonzero. normalize
sub.l d6,d0
lsl.l d6,d1
bfextu d2{0:d6},d7
lsl.l d6,d2
or.l d7,d1
mod_init:
move.l d3,EXPOY(a6) ; save expo(Y)
sub.l d3,d0 ; L = expo(X) - expo(Y)
bge.b @1 ; L >= 0
add.l d3,d0 ; L < 0 -> |x| < |y|. restore expo(X)
suba.l a1,a1 ; zero quotient in a1
bra.b get_mod
@1:
movea.l d0,a1 ; save L in a1
; Remainder algorithm using DIV32 subroutine handles large exponent
; differences much faster than any subtraction algorithm.
;
; D3/D1/D2: dividend/shifted remainder (96 bits)
; D4/D5: divisor (64 bits)
; D0/D6/D7: scratch
; A1: loop count
moveq #0,D3 ; zero high longword of dividend
andi.l #$0000001f,d0 ; alignment shift count
beq.b @2 ; if zero, do first DIV32
; shift dividend in d1/d2 left into d3/d1/d2
bfextu d1{0:d0},d3 ; shift high bits from d1 into d3
lsl.l d0,d1 ; shift d1 left
bfextu d2{0:d0},d6 ; extract d2 high bits
lsl.l d0,d2 ; shift d2 left
or.l d6,d1 ; insert extracted d2 bits into shifted d1
; Do initial division of D3/D1/D2 by D4/D5. 32-bit quotient in D0 with
; remainder in D3/D1 shifted by 32 bits.
@2:
bsr div32
; Remaining number of REM/MOD steps, if any, are done 32 at a time using DIV32.
; Final D0 value is lowest 32 bits of the quotient, and result is in D3/D1.
move.l a1,d6 ; get number of remaining 32-bit steps
bfextu d6{16:11},d6
beq.b moddivdone ; if zero, clean up
move.l d6,a1 ; a1 <- # of 32-bit steps
moveq #0,d2 ; zero trailing remainder bits
modlp32:
bsr div32
subq.l #1,a1
move.l a1,d6
bne.b modlp32
; At this point, remainder is in D3/D1 and exponent is in EXPOY(a6). Remainder
; may not be normalized. Low 32 bits of quotient in D0.
moddivdone:
move.l d1,d2 ; put result in D0/D1/D2
move.l d3,d1
move.l EXPOY(a6),d3 ; d3.w <- expoy
movea.l d0,a1 ; quotient in a1
move.l d3,d0 ; result expo = EXPOY
tst.l d1 ; test high significand
bne.b @1 ; nonzero
tst.l d2 ; test low significand
beq rem_is_0 ; zero result
move.l d2,d1 ; unnormalized with sig.hi zero
moveq #0,d2 ; normalize it
subi.l #32,d0
bfffo d1{0:32},d6
lsl.l d6,d1
sub.l d6,d0
bra.b get_mod
@1: ; sig.hi is nonzero
bmi.b get_mod ; normalized
bfffo d1{0:32},d6 ; unnormalized. Normalize it
sub.l d6,d0
lsl.l d6,d1
bfextu d2{0:d6},d7
lsl.l d6,d2
or.l d7,d1
; At this point, normalized remainder/mod in D0/D1/D2, low 32 bits of quotient
; in a1. D3/D4/D5 holds Y.
get_mod:
cmpi.l #64,d0 ; if result is too small, scale upward
bge.b no_scale ; large enough
addi.l #126,d0 ; scale result and Y upward by 2**126
addi.l #126,d3
addq.l #1,a2 ; flag scaled values
no_scale:
move.w d0,RMOD(a6) ; store result in RMOD
clr.w RMOD+2(a6)
move.l d1,RMOD+4(a6)
move.l d2,RMOD+8(a6)
move.w d3,YMOD(a6) ; store Y in YMOD
clr.w YMOD+2(a6)
move.l d4,YMOD+4(a6)
move.l d5,YMOD+8(a6)
fmove.x RMOD(a6),fp0 ; fp0 <- scaled RMOD
tst.l MOD_FLAG(a6) ; mod or rem?
beq.b fix_sign ; mod
subq.l #1,d3 ; rem. Is RMOD small enough?
cmp.l d3,d0
blt.b fix_sign ; yes
bgt.b last_sub ; no
cmp.l d4,d1 ; rem with expo(RMOD) = expo(YMOD) - 1
bne.b not_eq ; sig(RMOD) != sig(YMOD)
cmp.l d5,d2
beq.b tie_case ; rem with RMOD = YMOD/2 exactly
not_eq:
bcs.b fix_sign ; RMOD < YMOD/2
last_sub: ; rem with YMOD > RMOD > YMOD/2
fsub.x ymod(a6),fp0 ; last rem step (no exceptions)
addq.l #1,a1 ; increment quotient
fix_sign:
move.w -12(a0),d6 ; test sign of X
bpl.b @1 ; positive
fneg.x fp0 ; negative. negate fp0
@1:
move.w (a0),d7 ; d7.w <- sign/exp of Y
eor.w d7,d6
lsr.l #8,d6
andi.l #$00000080,d6 ; d6 bit 7 is sign of quotient
move.l a1,d7 ; get low 7 bits of quotient in d7
andi.l #$7f,d7
or.l d7,d6 ; OR in sign bit
swap d6
fmove.l FPSR,d7 ; insert quotient in FPSR
andi.l #$ff00ffff,d7
or.l d6,d7
fmove.l d7,FPSR
; Restore registers and round result
move.l a2,d0 ; d0 <- scaling flag
movem.l (a7)+,d2-d7/a2 ; restore registers
fmove.l USER_FPCR(a6),FPCR ; restore user's rounding
tst.l d0 ; scaling?
beq.b @2 ; no
fmul.s #"$00800000",fp0 ; scale by 2**(-126)
bra t_avoid_unsupp ; check for denorm as a
; result of the scaling
@2:
fmove.x fp0,fp0 ; capture rounding exceptions
rts ; done
; Result is zero
rem_is_0:
fmove.b #0,fp0 ; +0 in fp0
bra.b fix_sign
; rem result RMOD = YMOD/2 exactly
tie_case:
move.l a1,d6 ; check parity of quotient
andi.b #1,d6
beq.b fix_sign ; even
addq.l #1,a1 ; odd -> bump quotient
fneg.x fp0 ; and toggle sign of X in fp0
bra.b fix_sign
; Subroutine DIV32 calculates a 32-bit quotient from a 64-bit dividend
; and a 64-bit divisor and returns a shifted (by 32 bits) remainder.
;
; D3/D1 - dividend cum shifted remainder
; D2 - bits to be shifted into low half of remainder
; D4/D5 - divisor
; D0 - 32-bit quotient
; D6,D7 - scratch
div32:
divu.l d4,d3:d1 ; divide step (64 bits / 32 bits)
bvs.b divofl ; rare overflow case
move.l d1,d0 ; initialize quotient
move.l d1,d7 ; multiply quotient by divisor.low
mulu.l d5,d6:d7
ctndiv:
move.l d2,d1 ; shifted remainder in D3/D1
sub.l d7,d1 ; subtract correction from shifted remainder
subx.l d6,d3
bcc.b divok ; ok if no carry
onemore:
subq.l #1,d0 ; correction produced carry; decr quotient
add.l d5,d1 ; and adjust remainder upward until positive
addx.l d4,d3
bcc.b onemore
divok:
rts ; return
; rare case of division producing overflow. Fix up and continue
divofl:
move.l d5,d6 ; set D6/D7 to $100000000 * D5
moveq #0,d7
move.l d1,d3 ; simulate remainder for quotient of $100000000
moveq #0,d0 ; quotient effectively $100000000
bra.b ctndiv ; <T3>