;
; File: BlockMove.a
;
; Contains: Here is the optimized Mac block move routine. It handles overlapping
; blocks by copying in incrementing or decrementing address order as
; appropriate. Moves of 12 bytes or less are special-cased, longer moves
; use unrolled MOVE.L loops, and the 68040 version uses MOVE16 when the
; alignment allows it.
;
; Written by: Andy Hertzfeld
; Re-Written by: Gary Davidian
;
; Copyright: © 1982-1993 by Apple Computer, Inc., all rights reserved.
;
; Change History (most recent first):
;
; <SM5> 5/18/93 kc Roll in Gary's clean up and large overlap bug fix.
; <SM4> 4/23/93 kc Add "IF Supports24Bit" around "and.l MaskBC,d1" in Block Move.
; <SM3> 11/10/92 CSS Update from Reality:
; <12> 10/23/92 pvh Reset d1 from d2 for byte copies less than 12 bytes (veryShort
; needs it)
; <11> 10/15/92 DTY Use D2 for the 12 byte or less check so we can preserve the trap
; word in D1, which is now checked to see if we're doing
; BlockMove or BlockMoveData. BlockMoveData, which doesn't flush
; the cache, is signaled by having bit 9 (the immediate bit) set
; in the trap word.
; <SM2> 10/16/92 RB Removed the jCacheFlush call from the 68040 BlockMove, that code
; is never executed. (Horror has the same code, so don't bring it
; over!)
; <10> 2/12/92 JSM Moving to MemoryMgr directory, keeping all revisions.
; <9> 2/6/92 RB Fixed bug in 68040 version of BlockMove. The bug was introduced
; (by me) while moving the code from Terror into Reality.
; <8> 1/3/92 RB Rolled in 68040 version of BlockMove from Terror. It gets
; installed from StartInit.a when an 040 is present.
; <7> 10/18/91 JSM Remove 68000 versions.
; <6> 8/29/91 JSM Cleanup header.
; <5> 9/18/90 BG Removed <2>, <4>. 040s are behaving more reliably now.
; <4> 8/3/90 BG Added some EclipseNOPs for flaky 040s. Currently 040s require
; ANY instruction to separate two adjacent MOVEMs.
; <3> 7/17/90 dba change name of BlockMove routine so it does not conflict with
; the BlockMove glue
; <2> 6/18/90 CCH Added NOPs for flaky 68040's.
; <1.6> 7/15/89 GGD Added code alignment for better burst performance.
; <1.5> 2/22/89 GGD Made the 68020 version work in 32 bit mode as well as 24 bit
; mode, and work with move counts greater than 2**24.
; <1.4> 2/20/89 rwh re-spelled conditional in comment, won't show up in searches.
; <1.3> 12/6/88 GGD Fixed an incorrect register bug in the 68000 decrementing
; path.
; <1.2> 11/17/88 GGD Re-written and optimized, although the algorithms are still basically the
; same for 68000 machines. Now the decrementing copy loops are
; only used if there is overlap between the source and dest, since
; the incrementing address modes are faster on the 68000, also
; optimized the MOVEM loop. 68000 version also includes the 68020
; version, and the correct version is chosen at start time based
; upon CpuFlag. This way accelerated machines will get faster
; moves and correct cache flushing. For 68020 got rid of single
; byte at a time move loop because the 020 can read words from odd
; addresses. Also longword aligned the destination to reduce bus
; cycles. Special cased very short (up to 12 byte) copies to read
; the entire source, and then write it out so that overlap need
; not be checked, and misalignment doesn't cost too much. It now
; only flushes the instruction cache when the length of the move
; is greater than 12 bytes.
; <1.1> 11/10/88 CCH Fixed Header.
; <1.0> 11/9/88 CCH Adding to EASE.
; <•1.1> 9/23/88 CCH Got rid of inc.sum.d and empty nFiles
; <1.0> 2/10/88 BBM Adding file for the first time into EASE
; <Cxxx> 10/16/87 rwh Port to Modern Victorian (onMvMac)
; <C690> 1/24/87 JTC Improvements for 020. With new longword alignment in 020
; <C668> 1/22/87 bbm made the code which flushed the cache a external vector.
; <C482> 12/4/86 bbm The code to flush the cache in blockmove needed to be set in
; conditionals for NuMac. <1.4>
; <C456> 11/22/86 bbm moved the code to flush the cache into blockmove, loadseg,
; unloadseg, and read. this might improve performance.
; <C206> 10/9/86 bbm Made file use mpw aincludes.
; 2/19/86 BBM Made some modifications to work under MPW
; 4/23/85 SC Ensure D0=0 upon exit (I told you there was a bug)
; 4/20/85 JTC Added .DEF for routine name!
; 4/16/85 SC Rewrote with no space consideration and new blockmove
; statistics. On the average, this is 30% faster than old one.
; 1/29/85 EHB Check for negative lengths too!!
; 1/23/85 LAK Adapted for new equate files.
; 8/18/83 JTC Hacked for space by JTC
; 3/9/83 AJH Fixed bug by making it add long for moving right
; 10/31/82 AJH Integrated for ROM
; 8/26/82 AJH Modified it to support blocks > 64K
; 5/19/82 AJH fixed bug Malloy found in cleaning up after MOVEM loop
; 5/12/82 AJH re-organized things to make it cleaner
;
;
;_______________________________________________________________________
;
; BlockMove(SrcPtr,DstPtr: Ptr; nBytes: LONGINT);
;
; Here is the optimized Mac block move routine. It handles overlapping
; blocks by copying in incrementing or decrementing address order as
; appropriate. Moves of 12 bytes or less are special-cased, longer moves
; use unrolled MOVE.L loops, and the 68040 version uses MOVE16 when the
; alignment allows it.
;
; It uses a register interface. A0 = source, A1 = destination, D0 = count.
; The addresses are first masked with $00FFFFFF in 24 bit mode.
;
; Register usage during computation:
; D0 = count
; D1 = destination pointer - source pointer
; A0 = source pointer
; A1 = destination pointer
;
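;
; For illustration, a rough C sketch of the direction test described above
; (names are placeholders, not actual routines in this source):
;
;   void blockmove(const char *src, char *dst, unsigned long n)
;   {
;       if ((unsigned long)(dst - src) < n) {            /* dest starts inside src,    */
;           while (n--)                                   /* so copy high-to-low        */
;               dst[n] = src[n];                          /* (decrementing addresses)   */
;       } else {
;           unsigned long i;
;           for (i = 0; i < n; i++)                       /* otherwise copy low-to-high */
;               dst[i] = src[i];                          /* (incrementing addresses)   */
;       }
;   }
;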
; written by Andy Hertzfeld May 10, 1982
; re-written by Gary Davidian Nov 13, 1988
;
; Copyright Apple Computer, Inc. 1982-1989
; All Rights Reserved
;
; Ancient Modification History: (for historical purposes only, does not correspond to this code)
;
; 12-May-82 AJH re-organized things to make it cleaner
; 19-May-82 AJH fixed bug Malloy found in cleaning up after MOVEM loop
; 26-Aug-82 AJH Modified it to support blocks > 64K
; 31-Oct-82 AJH Integrated for ROM
; 09-Mar-83 AJH Fixed bug by making it add long for moving right
; 18-Aug-83 JTC Hacked for space by JTC
;
;_______________________________________________________________________
; 23 Jan 85 LAK Adapted for new equate files.
; 29-Jan-85 EHB Check for negative lengths too!!
; 16-Apr-85 SC Rewrote with no space consideration and new blockmove
; statistics. On the average, this is 30% faster than
; old one.
; 20 Apr 85 JTC Added .DEF for routine name!
; 23 Apr 85 SC Ensure D0=0 upon exit (I told you there was a bug)
;_______________________________________________________________________
;
; Post Lonely Hearts
;_______________________________________________________________________
;
; <19feb86> BBM Made some modifications to work under MPW
;<C206/09oct86> bbm Made file use mpw aincludes.
;<C456/22nov86> bbm moved the code to flush the cache into blockmove, loadseg,
; unloadseg, and read. this might improve performance.
;<C482/04dec86> bbm The code to flush the cache in blockmove needed to be set
; in conditionals for NuMac. <1.4>
;<C668/22jan87> bbm made the code which flushed the cache a external vector.
;<C690/24jan87> JTC Improvements for 020. With new longword alignment in 020
; memory managers, it's worthwhile to take advantage of the fastest possible
; move, an unrolled dbra loop of MOVE.Ls. We arbitrarily choose 16 moves,
; since the dbra overhead looks like 6 cycles based on work with Ron H.
; The 16 cases of interest are moves from (4N+K) to (4M+J) where M and
; N are nonnegative and 0 ≤ K,J ≤ 3. As before, lump all even/odd cases
; into one big dbra thrash by bytes.
;<Cxxx/16oct87> rwh Port to Modern Victorian (onMvMac)
;_______________________________________________________________________
; Interesting numbers from pseudo-random sampling:
; 80%+ calls are for 1-31 bytes
; 95%+ calls are for less than 256 bytes
; On a 512K Mac, 20% of the calls come from memory manager
; On a 128K Mac, 40% of the calls come from memory manager
; => this probably should be JSR'ed to from Memory Manager
print off
LOAD 'StandardEqu.d'
print on
print nomdir
machine mc68040
BlockMoves proc
export __BlockMove ; Default version
export BlockMove68020 ; 68020 version (flushes cache too)
EXPORT BlockMove68040 ; 68040 version <8> rb
eject
align alignment
Loop16 move16 (a0)+,(a1)+ ; move 32 bytes, 16 at a time
sub.l d2,d0 ; adjust for the 32 bytes just moved
move16 (a0)+,(a1)+
bge.s Loop16 ; loop until count is -32…-1
jmp CopyTailInc(d0.w*2) ; copy the remaining bytes
Title 'BlockMove - Copy Tail Incrementing'
;_______________________________________________________________________
;
; Routine: CopyTailInc
; Inputs: A0 - source address
; A1 - destination address
; Outputs: D0 - error code (noErr)
; Destroys: A0, A1
;
; Function: Copy up to 31 bytes in incrementing address order using a direct
; sequence of moves. This routine returns to the BlockMove caller
; with D0=noErr.
;
; Calling Convention:
; D0 is set up with size-32, so that moving 0…31 bytes => d0 = -32…-1
; The trick is to double D0 and use it as an index into a table of
; branches to the appropriate code. Thanks to Steve Capps for all this.
;
; 68000: add.w d0,d0 ; then jmp CopyTailInc(d0.w)
; 68020: jmp CopyTailInc(d0.w*2)
;
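;
; As a worked example of the dispatch (each bra.s entry below is 2 bytes, and the
; 31-entry branch table plus the single TailInc31 move ends exactly at CopyTailInc):
;
;     bytes left    n = 0..31
;     D0 on entry   n - 32              (i.e. -32..-1)
;     jump target   CopyTailInc + 2*D0  =  CopyTailInc - 64 + 2*n
;
; so n = 0 lands on "bra.s TailInc00" 64 bytes before the label, and n = 31 lands
; on the move.l just before it.
;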
;_______________________________________________________________________
TailInc30 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 30 bytes Incrementing
TailInc26 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 26 bytes Incrementing
TailInc22 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 22 bytes Incrementing
TailInc18 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 18 bytes Incrementing
TailInc14 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 14 bytes Incrementing
TailInc10 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 10 bytes Incrementing
TailInc06 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 6 bytes Incrementing
TailInc02 move.w (a0)+,(a1)+ ; 12 1 1 1 copy final 2 bytes Incrementing
moveq.l #noErr,d0 ; 4 1 0 0 return success status
rts ; 16 2 2 0 _BlockMove complete
bra.s TailInc00 ; 10 2 0 0 copy final 0 bytes Incrementing
bra.s TailInc01 ; 10 2 0 0 copy final 1 byte Incrementing
bra.s TailInc02 ; 10 2 0 0 copy final 2 bytes Incrementing
bra.s TailInc03 ; 10 2 0 0 copy final 3 bytes Incrementing
bra.s TailInc04 ; 10 2 0 0 copy final 4 bytes Incrementing
bra.s TailInc05 ; 10 2 0 0 copy final 5 bytes Incrementing
bra.s TailInc06 ; 10 2 0 0 copy final 6 bytes Incrementing
bra.s TailInc07 ; 10 2 0 0 copy final 7 bytes Incrementing
bra.s TailInc08 ; 10 2 0 0 copy final 8 bytes Incrementing
bra.s TailInc09 ; 10 2 0 0 copy final 9 bytes Incrementing
bra.s TailInc10 ; 10 2 0 0 copy final 10 bytes Incrementing
bra.s TailInc11 ; 10 2 0 0 copy final 11 bytes Incrementing
bra.s TailInc12 ; 10 2 0 0 copy final 12 bytes Incrementing
bra.s TailInc13 ; 10 2 0 0 copy final 13 bytes Incrementing
bra.s TailInc14 ; 10 2 0 0 copy final 14 bytes Incrementing
bra.s TailInc15 ; 10 2 0 0 copy final 15 bytes Incrementing
bra.s TailInc16 ; 10 2 0 0 copy final 16 bytes Incrementing
bra.s TailInc17 ; 10 2 0 0 copy final 17 bytes Incrementing
bra.s TailInc18 ; 10 2 0 0 copy final 18 bytes Incrementing
bra.s TailInc19 ; 10 2 0 0 copy final 19 bytes Incrementing
bra.s TailInc20 ; 10 2 0 0 copy final 20 bytes Incrementing
bra.s TailInc21 ; 10 2 0 0 copy final 21 bytes Incrementing
bra.s TailInc22 ; 10 2 0 0 copy final 22 bytes Incrementing
bra.s TailInc23 ; 10 2 0 0 copy final 23 bytes Incrementing
bra.s TailInc24 ; 10 2 0 0 copy final 24 bytes Incrementing
bra.s TailInc25 ; 10 2 0 0 copy final 25 bytes Incrementing
bra.s TailInc26 ; 10 2 0 0 copy final 26 bytes Incrementing
bra.s TailInc27 ; 10 2 0 0 copy final 27 bytes Incrementing
bra.s TailInc28 ; 10 2 0 0 copy final 28 bytes Incrementing
bra.s TailInc29 ; 10 2 0 0 copy final 29 bytes Incrementing
bra.s TailInc30 ; 10 2 0 0 copy final 30 bytes Incrementing
TailInc31 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 31 bytes Incrementing
CopyTailInc ; copy final 0…31 bytes Incrementing
TailInc27 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 27 bytes Incrementing
TailInc23 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 23 bytes Incrementing
TailInc19 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 19 bytes Incrementing
TailInc15 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 15 bytes Incrementing
TailInc11 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 11 bytes Incrementing
TailInc07 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 7 bytes Incrementing
TailInc03 move.w (a0)+,(a1)+ ; 12 1 1 1 copy final 3 bytes Incrementing
TailInc01 move.b (a0)+,(a1)+ ; 12 1 1 1 copy final 1 byte Incrementing
moveq.l #noErr,d0 ; 4 1 0 0 return success status
rts ; 16 2 2 0 _BlockMove complete
TailInc28 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 28 bytes Incrementing
TailInc24 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 24 bytes Incrementing
TailInc20 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 20 bytes Incrementing
TailInc16 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 16 bytes Incrementing
TailInc12 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 12 bytes Incrementing
TailInc08 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 8 bytes Incrementing
TailInc04 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 4 bytes Incrementing
TailInc00 moveq.l #noErr,d0 ; 4 1 0 0 return success status
rts ; 16 2 2 0 _BlockMove complete
TailInc29 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 29 bytes Incrementing
TailInc25 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 25 bytes Incrementing
TailInc21 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 21 bytes Incrementing
TailInc17 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 17 bytes Incrementing
TailInc13 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 13 bytes Incrementing
TailInc09 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 9 bytes Incrementing
TailInc05 move.l (a0)+,(a1)+ ; 20 1 2 2 copy final 5 bytes Incrementing
move.b (a0)+,(a1)+ ; 12 1 1 1 copy final 1 byte Incrementing
moveq.l #noErr,d0 ; 4 1 0 0 return success status
rts ; 16 2 2 0 _BlockMove complete
Title 'BlockMove - Copy Incrementing 68020 / 68030'
align alignment ; <1.6>
CopyInc68020
; Check to see if the source and destination overlap such that a copy using
; incrementing addresses would cause modification of the source, in which case
; we must copy starting at the end of the fields, using decrementing addresses.
move.l jCacheFlush,-(sp) ; flush the instruction cache when we exit
move.l a1,d1 ; get the destination address
moveq.l #-4,d2 ; setup mask for longword alignment
or.l d1,d2 ; d2= -1…-4, number of bytes to align
sub.l a0,d1 ; d1 := dest - src
IF Supports24Bit THEN
; Mask the address difference to 24 bits before comparing to length, this is needed for
; 24 bit mode where the upper byte of the addresses may have flags. This even works in
; 32 bit mode as long as the byte count doesn't exceed 24 bits, although in a 32 bit only
; system, this masking can be eliminated.
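;
; As a worked example (addresses are made up): in 24 bit mode src = $20001000 and
; dst = $00001100 can refer to locations only $100 bytes apart, with unrelated flag
; bytes up top; dst - src = $E0000100, and masking with $00FFFFFF recovers the true
; 24 bit difference of $000100 before it is compared with the length.
;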
andi.l #$00FFFFFF,d1 ; strip off the high byte for 24 bit mode
cmp.l d0,d1 ; see if dest is before end of src
blo.s overlap ; if so, fields overlap, must copy decrementing addrs
ELSE
cmp.l d0,d1 ; see if dest is before end of src
blo.w CopyDec68020 ; if so, fields overlap, must copy decrementing addrs
ENDIF
; Align the destination to a longword boundary to reduce bus cycles. On a 68030
; with the data cache, the unaligned reads will cache, so that the same long word
; will not need to be read from RAM more than once. At this point we also know that
; we are moving more than 12 bytes, so that the 0…3 bytes of alignment will not
; exceed the length.
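;
; As a worked example of the alignment math: after the "or.l d1,d2" above,
; d2 = (dst mod 4) - 4, i.e. -4, -3, -2 or -1. The indexed jump below enters the
; run of byte moves at the point that copies 3, 2 or 1 bytes (or branches straight
; to "aligned" for -4), and the fall-through "add.l d2,d0" charges those bytes
; against the count.
;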
moveLongs ; <8> rb
jmp align(d2.w*2) ; jump to the alignment routine <8> rb
bra.s aligned ; -4, already longword aligned <8> rb
move.b (a0)+,(a1)+ ; -3, move 3 bytes to align
move.b (a0)+,(a1)+ ; -2, move 2 bytes to align
move.b (a0)+,(a1)+ ; -1, move 1 byte to align
align add.l d2,d0 ; adjust the byte count after alignment <8> rb
aligned moveq.l #32,d2 ; byte count adjustment, 32 bytes at a time <8> rb
sub.l d2,d0 ; setup for CopyTailInc
bge.s longsLoop ; if 32 or more bytes left, use longword loop <8> rb
jmp CopyTailInc(d0.w*2) ; copy the remaining bytes
longsLoop move.l (a0)+,(a1)+ ; move 32 bytes, 4 at a time <8> rb
move.l (a0)+,(a1)+ ; (using a DBRA in this loop would save
move.l (a0)+,(a1)+ ; 2 clocks per loop, but would incur
move.l (a0)+,(a1)+ ; overhead outside of the loop that would
move.l (a0)+,(a1)+ ; slow down the shorter and more frequent
move.l (a0)+,(a1)+ ; cases)
move.l (a0)+,(a1)+ ;
move.l (a0)+,(a1)+ ;
sub.l d2,d0 ; adjust for the 32 bytes just moved
bge.s longsLoop ; loop until count is -32…-1 <8> rb
jmp CopyTailInc(d0.w*2) ; copy the remaining bytes
IF Supports24Bit THEN
overlap cmpi.l #$007FFFFF,d0 ; see if length > 23 bits
bls.w CopyDec68020 ; if small len, don't care about address mode
; If the length needs more than 23 bits, we may be in 32 bit mode, and the <1.5>
; source and destination may differ by more than 2**23, in which case the <1.5>
; masked address difference above would not detect overlap reliably. <1.5>
; Fall back to comparing the addresses themselves: a decrementing copy is always
; safe when the destination is at or above the source, and an incrementing copy
; is safe when it is below.
move.l a1,-(sp) ; push dst address
move.l a0,-(sp) ; push src address
btst.b #Systemis24bit,SystemInfo ; are we in 24 bit mode?
beq.s @32bit ; nope
clr.b 4(sp) ; strip dst address
clr.b (sp) ; strip src address
@32bit cmpm.l (sp)+,(sp)+ ; see if source <= dest (dec move needed)
bhs.w CopyDec68020 ; if so, start copy decrementing addrs <1.5>
jmp align(d2.w*2) ; go to the incr. alignment routine <1.5> <8> rb
ENDIF
Title 'BlockMove - 68020 / 68030 Block Move'
;_______________________________________________________________________
;
; Routine: BlockMove68020
; Inputs: A0 - source address
; A1 - destination address
; D0 - byte count
; Outputs: D0 - error code (noErr)
; Destroys: A0, A1, D1, D2
;
; Function: 68020/68030 block move routine, checks source, destination, and
; length to determine if the fields overlap in such a way that
; the direction of the copy should be changed to start at the
; end of the fields and use decrementing addresses. The default
; and preferred order is to copy from the beginning of the
; fields and use incrementing addresses. There is also a special
; case for moves of up to 12 bytes, in which case we read the entire
; source field into registers before writing it to the destination
; so that we don't need to check for overlap.
;
;_______________________________________________________________________
align alignment ; <1.6>
__BlockMove ; ENTER HERE
BlockMove68020 ; ENTER HERE
moveq.l #-12,d2 ; special case length is 1…12
add.l d0,d2 ; see if length <= 12
bgt.s CopyInc68020 ; if not, normal case, try incrementing first
veryShort tst.l d0 ; check byte count <8> rb
ble.s @done ; if count was negative or zero, we're done
; if count was 1…12, we can read the entire source into registers and then write it
; out to the destination without having to worry about overlap, since all reads are
; completed before any writes start. Since the count is so small, it's unlikely that
; we are moving code, so to improve performance, don't flush the instruction cache.
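;
; For illustration, a rough C sketch of this short-move strategy (src, dst, n are
; placeholders); the code below does the same thing with bfextu/bfins so the odd
; tail is read and written as a single bit field:
;
;   unsigned char tmp[12];
;   unsigned long i;
;   for (i = 0; i < n; i++) tmp[i] = src[i];   /* all reads complete ...         */
;   for (i = 0; i < n; i++) dst[i] = tmp[i];   /* ... before any byte is written */
;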
lsl.w #3,d0 ; convert byte count to bit count (bfextu/bfins take the width mod 32, 0 means 32)
addq.l #4,d2 ; see if 1…8 / 9…12
ble.s @short1to8 ; if count <= 8, check for 1<>4 / 5<>8
@short9to12 addq.l #8,a0 ; point to final 1…4 bytes from source
bfextu (a0){0:d0},d1 ; read final 1…4 bytes from source
move.l -(a0),d2 ; read middle 4 bytes from source
move.l -(a0),(a1)+ ; copy first 4 bytes from source to dest
@done5to8 move.l d2,(a1)+ ; write middle (or first) 4 bytes to destination
@done1to4 bfins d1,(a1){0:d0} ; write final 1…4 bytes to destination
@done moveq.l #noErr,d0 ; return success status
rts ; _BlockMove complete (don't flush cache)
@short1to8 addq.l #4,d2 ; see if 1…4 / 5…8
ble.s @short1to4 ; if count <= 4, go move it
@short5to8 move.l (a0)+,d2 ; read first 4 bytes from source
bfextu (a0){0:d0},d1 ; read final 1…4 bytes from source
bra.s @done5to8 ; write to destination and exit
@short1to4 bfextu (a0){0:d0},d1 ; read 1…4 bytes from source
bra.s @done1to4 ; write to destination and exit
Title 'BlockMove - 68040 MOVE16 optimizations'
; <8> rb
;========================== 68040 BlockMove =========================
; Terror History of 68040 BlockMove:
;
;
; <7> 5/10/91 RP Rolled in GGDs MOVE16 BlockMove patch.
; <6> 4/25/91 CCH Removed NOPs, since the bug was the unsetup DFC, not the 68040.
; <5> 4/25/91 CCH Set DFC register before doing a PTEST.
; <4> 4/24/91 CCH Added a NOP in front of the CPUSHL instructions since they
; currently don't always work in the D43B mask set of 68040's.
; <3> 4/21/91 CCH Fixed BlockMove patch to convert addresses from logical to
; physical when using CPUSHL to flush.
; <2> 4/2/91 CCH Don't optimize if VM is on.
; <1> 4/2/91 CCH first checked in
;_______________________________________________________________________
;
; Routine: BlockMove68040
; Inputs: A0 - source address
; A1 - destination address
; D0 - byte count
; D1 - trap word: Don't flush the cache if immediate bit is set.
; Outputs: D0 - error code (noErr)
; Destroys: A0, A1, D1, D2
;
; Function: 68040 block move routine. Moves of 12 bytes or less take the
; common fast path. Longer moves first arrange for the caches to be
; flushed on exit (unless the immediate bit says not to), then use
; MOVE16 when the source/destination alignment allows it, falling
; back to the standard 68020/68030 MOVE.L loops otherwise.
;
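;
; For illustration, the cache flush decision described above amounts to this
; sketch (trapWord is a placeholder for the word passed in D1):
;
;   Boolean flushCache = (trapWord & (1 << 9)) == 0;   /* bit 9 set => _BlockMoveData, */
;                                                      /* so skip the cache flush      */
;
; which is what the "btst #noQueueBit,d1" below tests.
;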
;_______________________________________________________________________
align alignment
IMPORT FlushCRangeForBM
BlockMove68040 ; ENTER HERE
moveq.l #-12,d2 ; special case length is 1…12 <11> Use D2 to preserve trap word in D1
add.l d0,d2 ; see if length <= 12 <11> Use D2 to preserve trap word in D1
ble.s veryShort ; use fast path if length <= 12
CopyInc68040
btst #noQueueBit,d1 ; <11> Check to see if the immediate bit is set.
bnz.s @checkAlignment ; <11> If it's set, don't flush the cache on exit
move.l d0,-(sp) ; Count parameter for FlushCRange
move.l a1,-(sp) ; Address parameter for FlushCRange
bsr.s @checkAlignment ; move the data
bra.l FlushCRangeForBM ; flush the caches
@checkAlignment
; Check to see if the source and destination overlap such that a copy using
; incrementing addresses would cause modification of the source, in which case
; we must copy starting at the end of the fields, using decrementing addresses.
move.l a1,d1 ; get the destination address
moveq.l #-4,d2 ; setup mask for longword alignment
or.l d1,d2 ; d2= -1…-4, number of bytes to align
sub.l a0,d1 ; d1 := dest - src
IF Supports24Bit THEN
; Mask the address difference to 24 bits before comparing to length, this is needed for
; 24 bit mode where the upper byte of the addresses may have flags. This even works in
; 32 bit mode as long as the byte count doesn't exceed 24 bits, although in a 32 bit only
; system, this masking can be eliminated.
andi.l #$00FFFFFF,d1 ; strip off the high byte for 24 bit mode
cmp.l d0,d1 ; see if dest is before end of src
blo.w overlap ; if so, fields overlap, must copy decrementing addrs
ELSE
cmp.l d0,d1 ; see if dest is before end of src
blo.w CopyDec68020 ; if so, fields overlap, must copy decrementing addrs
ENDIF
; Align the destination to a longword boundary to reduce bus cycles. On a 68030
; with the data cache, the unaligned reads will cache, so that the same long word
; will not need to be read from RAM more than once. At this point we also know that
; we are moving more than 12 bytes, so that the 0…3 bytes of alignment will not
; exceed the length.
cmpi.l #47,d0 ; see if long enough
blo.w moveLongs ; if not, don't even think of Move16s
andi.b #$0F,d1 ; see if 16 byte relative alignment
bne.w moveLongs ; if not, can't use Move16
; Align the source / destination to a 16 byte boundary to use MOVE16. At this
; point we also know that we are moving at least 47 bytes, so that the 0…15 bytes
; of alignment will not exceed the length.
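;
; For illustration, the alignment count computed below is, in C,
;
;   unsigned long pad = (0 - (unsigned long) dst) & 15;   /* 0..15 bytes up to the */
;                                                          /* next 16 byte boundary */
;
; and the 47 byte floor tested above is the worst case pad (15) plus one full
; 32 byte MOVE16 pass, so aligning can never exhaust the byte count.
;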
@UseMove16 move.l a1,d1 ; get the destination address
neg.l d1 ; convert to byte count
moveq.l #15,d2 ; setup mask for 16 byte alignment
and.l d1,d2 ; d2= 0…15, number of bytes to align
beq.s @Aligned16 ; exit if no alignment needed
sub.l d2,d0 ; update byte count
lsr.l #1,d2 ; test bit 0 of alignment count
bcc.s @Aligned2 ; skip if already word aligned
move.b (a0)+,(a1)+ ; move 1 byte to force word alignment
@Aligned2 lsr.l #1,d2 ; test bit 1 of alignment count
bcc.s @Aligned4 ; skip if already long aligned
move.w (a0)+,(a1)+ ; move 1 word to force long alignment
@Aligned4 lsr.l #1,d2 ; test bit 2 of alignment count
bcc.s @Aligned8 ; skip if already double aligned
move.l (a0)+,(a1)+ ; move 1 long to force double alignment
tst.l d2 ; test bit 3 of alignment count
@Aligned8 beq.s @Aligned16 ; skip if already quad aligned
move.l (a0)+,(a1)+ ; move 1 double to force quad alignment
move.l (a0)+,(a1)+
@Aligned16 moveq.l #32,d2 ; byte count adjustment, 32 bytes at a time
sub.l d2,d0 ; setup for CopyTailInc
bclr.l #4,d0 ; see if tail >= 16 bytes
nop ; sync the pipeline for defective 68040s
beq.w Loop16 ; if not, start copy
move16 (a0)+,(a1)+ ; extra move16 to reduce tail size
bra.w Loop16 ; align the loop on a cache line
Title 'BlockMove - Copy Tail Decrementing'
;_______________________________________________________________________
;
; Routine: CopyTailDec
; Inputs: A0 - address just past the last source byte to copy
; A1 - address just past the last destination byte to copy
; Outputs: D0 - error code (noErr)
; Destroys: A0, A1
;
; Function: Copy up to 31 bytes in decrementing address order using a direct
; sequence of moves. This routine returns to the BlockMove caller
; with D0=noErr.
;
; Calling Convention:
; D0 is set up with size-32, so that moving 0…31 bytes => d0 = -32…-1
; The trick is to double D0 and use it as an index into a table of
; branches to the appropriate code. Thanks to Steve Capps for all this.
;
; 68000: add.w d0,d0 ; then jmp CopyTailDec(d0.w)
; 68020: jmp CopyTailDec(d0.w*2)
;
;_______________________________________________________________________
align alignment ; <1.6>
TailDec30 move.l -(a0),-(a1) ; 22 1 2 2 copy final 30 bytes Decrementing
TailDec26 move.l -(a0),-(a1) ; 22 1 2 2 copy final 26 bytes Decrementing
TailDec22 move.l -(a0),-(a1) ; 22 1 2 2 copy final 22 bytes Decrementing
TailDec18 move.l -(a0),-(a1) ; 22 1 2 2 copy final 18 bytes Decrementing
TailDec14 move.l -(a0),-(a1) ; 22 1 2 2 copy final 14 bytes Decrementing
TailDec10 move.l -(a0),-(a1) ; 22 1 2 2 copy final 10 bytes Decrementing
TailDec06 move.l -(a0),-(a1) ; 22 1 2 2 copy final 6 bytes Decrementing
TailDec02 move.w -(a0),-(a1) ; 14 1 1 1 copy final 2 bytes Decrementing
moveq.l #noErr,d0 ; 4 1 0 0 return success status
rts ; 16 2 2 0 _BlockMove complete
bra.s TailDec00 ; 10 2 0 0 copy final 0 bytes Decrementing
bra.s TailDec01 ; 10 2 0 0 copy final 1 byte Decrementing
bra.s TailDec02 ; 10 2 0 0 copy final 2 bytes Decrementing
bra.s TailDec03 ; 10 2 0 0 copy final 3 bytes Decrementing
bra.s TailDec04 ; 10 2 0 0 copy final 4 bytes Decrementing
bra.s TailDec05 ; 10 2 0 0 copy final 5 bytes Decrementing
bra.s TailDec06 ; 10 2 0 0 copy final 6 bytes Decrementing
bra.s TailDec07 ; 10 2 0 0 copy final 7 bytes Decrementing
bra.s TailDec08 ; 10 2 0 0 copy final 8 bytes Decrementing
bra.s TailDec09 ; 10 2 0 0 copy final 9 bytes Decrementing
bra.s TailDec10 ; 10 2 0 0 copy final 10 bytes Decrementing
bra.s TailDec11 ; 10 2 0 0 copy final 11 bytes Decrementing
bra.s TailDec12 ; 10 2 0 0 copy final 12 bytes Decrementing
bra.s TailDec13 ; 10 2 0 0 copy final 13 bytes Decrementing
bra.s TailDec14 ; 10 2 0 0 copy final 14 bytes Decrementing
bra.s TailDec15 ; 10 2 0 0 copy final 15 bytes Decrementing
bra.s TailDec16 ; 10 2 0 0 copy final 16 bytes Decrementing
bra.s TailDec17 ; 10 2 0 0 copy final 17 bytes Decrementing
bra.s TailDec18 ; 10 2 0 0 copy final 18 bytes Decrementing
bra.s TailDec19 ; 10 2 0 0 copy final 19 bytes Decrementing
bra.s TailDec20 ; 10 2 0 0 copy final 20 bytes Decrementing
bra.s TailDec21 ; 10 2 0 0 copy final 21 bytes Decrementing
bra.s TailDec22 ; 10 2 0 0 copy final 22 bytes Decrementing
bra.s TailDec23 ; 10 2 0 0 copy final 23 bytes Decrementing
bra.s TailDec24 ; 10 2 0 0 copy final 24 bytes Decrementing
bra.s TailDec25 ; 10 2 0 0 copy final 25 bytes Decrementing
bra.s TailDec26 ; 10 2 0 0 copy final 26 bytes Decrementing
bra.s TailDec27 ; 10 2 0 0 copy final 27 bytes Decrementing
bra.s TailDec28 ; 10 2 0 0 copy final 28 bytes Decrementing
bra.s TailDec29 ; 10 2 0 0 copy final 29 bytes Decrementing
bra.s TailDec30 ; 10 2 0 0 copy final 30 bytes Decrementing
TailDec31 move.l -(a0),-(a1) ; 22 1 2 2 copy final 31 bytes Decrementing
CopyTailDec ; copy final 0…31 bytes Decrementing
TailDec27 move.l -(a0),-(a1) ; 22 1 2 2 copy final 27 bytes Decrementing
TailDec23 move.l -(a0),-(a1) ; 22 1 2 2 copy final 23 bytes Decrementing
TailDec19 move.l -(a0),-(a1) ; 22 1 2 2 copy final 19 bytes Decrementing
TailDec15 move.l -(a0),-(a1) ; 22 1 2 2 copy final 15 bytes Decrementing
TailDec11 move.l -(a0),-(a1) ; 22 1 2 2 copy final 11 bytes Decrementing
TailDec07 move.l -(a0),-(a1) ; 22 1 2 2 copy final 7 bytes Decrementing
TailDec03 move.w -(a0),-(a1) ; 14 1 1 1 copy final 3 bytes Decrementing
TailDec01 move.b -(a0),-(a1) ; 14 1 1 1 copy final 1 byte Decrementing
moveq.l #noErr,d0 ; 4 1 0 0 return success status
rts ; 16 2 2 0 _BlockMove complete
TailDec28 move.l -(a0),-(a1) ; 22 1 2 2 copy final 28 bytes Decrementing
TailDec24 move.l -(a0),-(a1) ; 22 1 2 2 copy final 24 bytes Decrementing
TailDec20 move.l -(a0),-(a1) ; 22 1 2 2 copy final 20 bytes Decrementing
TailDec16 move.l -(a0),-(a1) ; 22 1 2 2 copy final 16 bytes Decrementing
TailDec12 move.l -(a0),-(a1) ; 22 1 2 2 copy final 12 bytes Decrementing
TailDec08 move.l -(a0),-(a1) ; 22 1 2 2 copy final 8 bytes Decrementing
TailDec04 move.l -(a0),-(a1) ; 22 1 2 2 copy final 4 bytes Decrementing
TailDec00 moveq.l #noErr,d0 ; 4 1 0 0 return success status
rts ; 16 2 2 0 _BlockMove complete
TailDec29 move.l -(a0),-(a1) ; 22 1 2 2 copy final 29 bytes Decrementing
TailDec25 move.l -(a0),-(a1) ; 22 1 2 2 copy final 25 bytes Decrementing
TailDec21 move.l -(a0),-(a1) ; 22 1 2 2 copy final 21 bytes Decrementing
TailDec17 move.l -(a0),-(a1) ; 22 1 2 2 copy final 17 bytes Decrementing
TailDec13 move.l -(a0),-(a1) ; 22 1 2 2 copy final 13 bytes Decrementing
TailDec09 move.l -(a0),-(a1) ; 22 1 2 2 copy final 9 bytes Decrementing
TailDec05 move.l -(a0),-(a1) ; 22 1 2 2 copy final 5 bytes Decrementing
move.b -(a0),-(a1) ; 14 1 1 1 copy final 1 byte Decrementing
moveq.l #noErr,d0 ; 4 1 0 0 return success status
rts ; 16 2 2 0 _BlockMove complete
Title 'BlockMove - Copy Decrementing 68020 / 68030'
; Align the destination to a longword boundary to reduce bus cycles. On a 68030
; with the data cache, the unaligned reads will cache, so that the same long word
; will not need to be read from RAM more than once. At this point we also know that
; we are moving more than 12 bytes, so that the 0…3 bytes of alignment will not
; exceed the length.
align alignment ; <1.6>
CopyDec68020
cmpa.l a0,a1 ; see if source=dest (no move needed)
beq.s TailDec00 ; if exactly the same, don't do the copy
adda.l d0,a0 ; point past end of source
adda.l d0,a1 ; point past end of destination
move.l a1,d1 ; get the destination address
moveq.l #3,d2 ; setup mask for longword alignment
and.l d1,d2 ; d2= 0…3, number of bytes to align
beq.s @aligned ; if already aligned, skip alignment
neg.l d2 ; negate to index backwards
jmp @align(d2.w*2) ; jump to the alignment routine
move.b -(a0),-(a1) ; -3, move 3 bytes to align
move.b -(a0),-(a1) ; -2, move 2 bytes to align
move.b -(a0),-(a1) ; -1, move 1 byte to align
@align add.l d2,d0 ; adjust the byte count after alignment
@aligned moveq.l #32,d2 ; byte count adjustment, 32 bytes at a time
sub.l d2,d0 ; setup for CopyTailDec
bge.s @longsLoop ; if 32 or more bytes left, use longword loop
jmp CopyTailDec(d0.w*2) ; copy the remaining bytes
@longsLoop move.l -(a0),-(a1) ; move 32 bytes, 4 at a time
move.l -(a0),-(a1) ; (using a DBRA in this loop would save
move.l -(a0),-(a1) ; 2 clocks per loop, but would incur
move.l -(a0),-(a1) ; overhead outside of the loop that would
move.l -(a0),-(a1) ; slow down the shorter and more frequent
move.l -(a0),-(a1) ; cases)
move.l -(a0),-(a1) ;
move.l -(a0),-(a1) ;
sub.l d2,d0 ; adjust for the 32 bytes just moved
bge.s @longsLoop ; loop until count is -32…-1
jmp CopyTailDec(d0.w*2) ; copy the remaining bytes
ENDP
END