Retro68/gcc/newlib/libc/machine/h8300/memcpy.S
2012-03-27 01:51:53 +02:00

150 lines
3.2 KiB
ArmAsm

#include "setarch.h"
#include "defines.h"
#ifdef __H8300SX__
; memcpy (dst, src, len) for H8SX, built around the movmd block-move
; instructions.
;
; In:   er0 = destination, er1 = source, r2 or er2 = length in bytes.
;       Which of r2/er2 is used is selected by the LEN macro, which is not
;       defined in this file (presumably it comes from defines.h).
; Out:  er0 is never written here, so it still holds the destination.
; Uses: er4 (count), er5 (source) and er6 (destination) for movmd.  They
;       are saved on entry and restored by the rts/l on every exit path.
.global _memcpy
_memcpy:
; Preserve the three registers that the movmd setup below overwrites.
stm.l er4-er6,@-er7
; Set up source and destination pointers for movmd.
mov.l er0,er6
mov.l er1,er5
; See whether the copy is long enough to use the movmd.l code.
; Although the code can handle anything longer than 6 bytes,
; it can be more expensive than movmd.b for small moves.
; It is better to use a higher threshold to account for this.
;
; Note that the exact overhead of the movmd.l checks depends on
; the alignments of the length and pointers. They are faster when
; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values
; are 0. This threshold is a compromise between the various cases.
cmp #16,LEN(r2)
blo simple
; movmd.l only works for even addresses. If one of the addresses
; is odd and the other is not, fall back on a simple move.
; bld loads bit 0 of the source into the carry flag and bxor then
; exclusive-ORs it with bit 0 of the destination, so carry is set
; exactly when the two parities differ.
bld #0,r5l
bxor #0,r6l
bcs simple
; Make the addresses even.
; Both pointers have the same parity at this point, so testing the
; source alone is enough.  If it is odd, copy one byte; that advances
; both pointers to an even address and shortens the remaining length.
bld #0,r5l
bcc word_aligned
mov.b @er5+,@er6+
sub #1,LEN(r2)
word_aligned:
; See if copying one word would make the first operand longword
; aligned. Although this is only really worthwhile if it aligns
; the second operand as well, it is no worse if it does not, so it
; hardly seems worth the overhead of a "band" check.
; The bit tested is bit 1 of the destination pointer (er6).
bld #1,r6l
bcc fast_copy
mov.w @er5+,@er6+
sub #2,LEN(r2)
fast_copy:
; Set (e)r4 to the number of longwords to copy.
; The length register itself is left alone so that its low two bits
; still give the number of trailing bytes afterwards.
mov LEN(r2),LEN(r4)
shlr #2,LEN(r4)
#ifdef __NORMAL_MODE__
; 16-bit pointers and size_ts: one movmd.l is enough. This code
; is never reached with r4 == 0.
movmd.l
; Keep only the 0-3 bytes that the longword copy did not cover, then
; share the tail with the short/misaligned entry point below.
and.w #3,r2
simple:
; Bytewise copy of r2 bytes.  Reached either by falling through with
; the remainder or directly from the checks above with the whole
; length.  A zero count skips movmd.b altogether.
mov.w r2,r4
beq quit
movmd.b
quit:
; Restore er4-er6 and return.
rts/l er4-er6
#else
; Skip the first iteration if the number of longwords is divisible
; by 0x10000.
mov.w r4,r4
beq fast_loop_next
; This loop copies r4 (!= 0) longwords the first time round and 65536
; longwords on each iteration after that.
fast_loop:
movmd.l
fast_loop_next:
; e4 is the upper half of the longword count, i.e. the number of
; 65536-longword passes still owed.  Keep looping until the decrement
; borrows.
sub.w #1,e4
bhs fast_loop
; Mop up any left-over bytes. We could just fall through to the
; simple code after the "and" but the version below is quicker
; and only takes 10 more bytes.
and.w #3,r2
beq quit
mov.w r2,r4
movmd.b
quit:
; Restore er4-er6 and return.
rts/l er4-er6
simple:
; Simple bytewise copy. We need to handle all lengths, including zero.
; r4 takes the low 16 bits of the length for the first movmd.b.  The
; upper 16 bits in e2 count the 65536-byte passes that follow, using
; the same borrow test as the longword loop above.
mov.w r2,r4
beq simple_loop_next
simple_loop:
movmd.b
simple_loop_next:
sub.w #1,e2
bhs simple_loop
; Restore er4-er6 and return.
rts/l er4-er6
#endif
#else
; Generic memcpy for the non-H8SX variants.  It copies backwards, from the
; end of both buffers toward the start, a word at a time when everything
; is even and a byte at a time otherwise.
;
; In:   A0P = dst, A1P = src, A2P = len, in the order listed by the
;       commented-out loads below.  Those loads are disabled, so the
;       values must already be in these registers on entry.
; Out:  A0P = dst (the write pointer has walked back to the start).
; Uses: A3P as the saved dst start.  A2 is reused as the data register
;       once the length has been consumed.
;
; The A0-A3 register names and the MOVP, ADDP and CMPP operations are
; macros that are not defined in this file (presumably in defines.h).
.global _memcpy
_memcpy:
; MOVP @(2/4,r7),A0P ; dst
; MOVP @(4/8,r7),A1P ; src
; MOVP @(6/12,r7),A2P ; len
MOVP A0P,A3P ; keep copy of final dst
ADDP A2P,A0P ; point to end of dst
CMPP A0P,A3P ; see if anything to do
; Equal pointers mean len == 0; A0P still equals dst, so just return.
beq quit
ADDP A2P,A1P ; point to end of src
; Let us see if we can do this in words.  A2L already holds the low
; byte of the length; OR the low bytes of the three pointers into it so
; that bit 0 ends up set if any of them is odd.  This destroys the
; length, which is no longer needed because the loops below terminate
; on a pointer compare.
or A0L,A2L ; or in the dst end address
or A3L,A2L ; or in the dst start address (length is already in A2L)
or A1L,A2L ; or in the src end address
btst #0,A2L ; see if the lsb is zero
bne byteloop
wordloop:
; Step the source pointer back by one word.  The instruction used
; differs between the two modes, presumably because of the pointer
; register width (confirm against defines.h).
#ifdef __NORMAL_MODE__
sub #2,A1P
#else
subs #2,A1P ; point to word
#endif
mov.w @A1P,A2 ; get word
mov.w A2,@-A0P ; save word (pre-decrement walks dst back as well)
CMPP A0P,A3P ; at the front again ?
bne wordloop
; A0P == A3P here, so A0P is the original dst.
rts
byteloop:
; Same loop as above, one byte at a time.
#ifdef __NORMAL_MODE__
sub #1,A1P
#else
subs #1,A1P ; point to byte
#endif
mov.b @A1P,A2L ; get byte
mov.b A2L,@-A0P ; save byte (pre-decrement walks dst back as well)
CMPP A0P,A3P ; at the front again ?
bne byteloop
; return with A0 pointing to dst
quit: rts
#endif