Retro68/gcc/newlib/libc/machine/h8300/memcpy.S

#include "setarch.h"

#include "defines.h"

#ifdef __H8300SX__

	; memcpy for H8SX; this half of the file is assembled only when
	; __H8300SX__ is defined.
	;
	; Register roles, as read from the code below:
	;   er0    destination pointer on entry.  Nothing in this routine
	;          writes er0, so it still holds that value on return.
	;   er1    source pointer on entry.
	;   (e)r2  byte count.  It is accessed through the LEN macro, which
	;          is defined outside this file (presumably in defines.h) and
	;          presumably picks r2 or er2 to match the width of size_t
	;          -- TODO confirm.
	;   er4    count register for movmd.
	;   er5    working source pointer.
	;   er6    working destination pointer.
	; er4-er6 are pushed on entry and popped again by every rts/l exit.
	.global _memcpy
_memcpy:
	stm.l	er4-er6,@-er7

	; Set up source and destination pointers for movmd.
	mov.l	er0,er6
	mov.l	er1,er5

	; See whether the copy is long enough to use the movmd.l code.
	; Although the code can handle anything longer than 6 bytes,
	; it can be more expensive than movmd.b for small moves.
	; It's better to use a higher threshold to account for this.
	;
	; Note that the exact overhead of the movmd.l checks depends on
	; the alignments of the length and pointers.  They are faster when
	; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values
	; are 0.  This threshold is a compromise between the various cases.
	cmp	#16,LEN(r2)
	blo	simple

	; movmd.l only works for even addresses.  If one of the addresses
	; is odd and the other is not, fall back on a simple move.
	; (bld puts bit 0 of the source pointer in the carry flag and bxor
	; xors bit 0 of the destination pointer into it, so carry ends up
	; set exactly when the two pointers differ in parity.)
	bld	#0,r5l
	bxor	#0,r6l
	bcs	simple

	; Make the addresses even.
	; Both pointers have the same parity at this point, so testing the
	; source pointer alone is enough.  If it is odd, copy a single byte
	; and take it off the count.
	bld	#0,r5l
	bcc	word_aligned
	mov.b	@er5+,@er6+
	sub	#1,LEN(r2)

word_aligned:
	; See if copying one word would make the first operand longword
	; aligned.  Although this is only really worthwhile if it aligns
	; the second operand as well, it's no worse if doesn't, so it
	; hardly seems worth the overhead of a "band" check.
	; The bit tested is bit 1 of er6, the destination pointer.
	bld	#1,r6l
	bcc	fast_copy
	mov.w	@er5+,@er6+
	sub	#2,LEN(r2)

fast_copy:
	; Set (e)r4 to the number of longwords to copy.
	; (e)r2 keeps the full remaining byte count; its low two bits are
	; used further down for the left-over bytes.
	mov	LEN(r2),LEN(r4)
	shlr	#2,LEN(r4)

#ifdef __NORMAL_MODE__
	; 16-bit pointers and size_ts: one movmd.l is enough.  This code
	; is never reached with r4 == 0.
	movmd.l
	; Reduce r2 to the 0-3 bytes that the longword copy left over and
	; fall through into the byte copy.
	and.w	#3,r2
simple:
	; Byte copy of r2 bytes.  Reached either by falling through from the
	; longword copy above or directly from the two "simple" branches.
	; A zero count skips the movmd.b altogether.
	mov.w	r2,r4
	beq	quit
	movmd.b
quit:
	rts/l	er4-er6
#else
	; Skip the first iteration if the number of longwords is divisible
	; by 0x10000.
	; (mov.w r4,r4 only sets the flags from the low word of the count.)
	mov.w	r4,r4
	beq	fast_loop_next

	; This loop copies r4 (!= 0) longwords the first time round and 65536
	; longwords on each iteration after that.
fast_loop:
	movmd.l
fast_loop_next:
	; e4 is the high word of the longword count.  Decrement it and go
	; round again unless the subtraction borrowed, i.e. unless e4 was
	; already zero.
	sub.w	#1,e4
	bhs	fast_loop

	; Mop up any left-over bytes.  We could just fall through to the
	; simple code after the "and" but the version below is quicker
	; and only takes 10 more bytes.
	and.w	#3,r2
	beq	quit
	mov.w	r2,r4
	movmd.b
quit:
	rts/l	er4-er6

simple:
	; Simple bytewise copy.  We need to handle all lengths, including zero.
	; Same structure as fast_loop: the low word of the count drives the
	; first movmd.b (skipped when it is zero) and e2, the high word,
	; counts the remaining passes.  Presumably each later pass moves
	; 65536 bytes, mirroring the longword loop -- TODO confirm against
	; the movmd.b definition.
	mov.w	r2,r4
	beq	simple_loop_next
simple_loop:
	movmd.b
simple_loop_next:
	sub.w	#1,e2
	bhs	simple_loop
	rts/l	er4-er6
#endif

#else

	; memcpy for the H8/300 variants other than H8SX (the else half of
	; the __H8300SX__ test).
	;
	; MOVP, ADDP, CMPP and the register names A0P to A3P, A2 and AnL are
	; macros defined outside this file (presumably in defines.h).  Going
	; by the commented-out loads below and by how the registers are used,
	; on entry A0P = dst, A1P = src and A2P = len.  AnL is presumably the
	; low byte of register An -- TODO confirm.
	;
	; The copy runs backwards: both pointers are first moved to the end
	; of their buffers and then stepped down until the destination
	; pointer is back at its starting value, which is kept in A3P.
	; A0P therefore holds the original dst again on return.
	; A1P, A2P and A3P are all modified.
	.global _memcpy
_memcpy:
;	MOVP	@(2/4,r7),A0P	; dst
;	MOVP	@(4/8,r7),A1P	; src
;	MOVP	@(6/12,r7),A2P	; len

	MOVP	A0P,A3P	; keep copy of final dst
	ADDP	A2P,A0P	; point to end of dst
	CMPP	A0P,A3P	; see if anything to do
	; End equal to start means a zero length: return straight away.
	beq	quit

	ADDP	A2P,A1P	; point to end of src

	; lets see if we can do this in words
	; A2L starts out as the low byte of the length.  The low bytes of
	; both end pointers and of the original dst are merged into it, so
	; bit 0 is set if any of those four values is odd.  This destroys
	; the length, which is not needed again because both loops stop on
	; a pointer comparison.
	or	A0L,A2L	; merge in the end-of-dst address
	or	A3L,A2L	; merge in the original dst address
	or	A1L,A2L	; merge in the end-of-src address
	btst	#0,A2L	; see if the lsb is zero
	; Something is odd: copy byte by byte instead.
	bne	byteloop

	; Word copy, from the last word down to the first.
wordloop:
#ifdef __NORMAL_MODE__
	sub	#2,A1P
#else
	subs	#2,A1P		; point to word
#endif
	mov.w	@A1P,A2		; get word
	mov.w	A2,@-A0P	; save word
	CMPP	A0P,A3P		; at the front again ?
	bne 	wordloop
	; A0P is back at the original dst here.
	rts

	; Byte copy, from the last byte down to the first.
byteloop:
#ifdef __NORMAL_MODE__
	sub	#1,A1P
#else
	subs	#1,A1P		; point to byte
#endif
	mov.b	@A1P,A2L	; get byte
	mov.b	A2L,@-A0P	; save byte
	CMPP	A0P,A3P 	; at the front again ?
	bne 	byteloop

	; return with A0 pointing to dst
quit:	rts

#endif
re-add newlib 2017-04-11 21:13:36 +00:00			`#include "setarch.h"`

			`#include "defines.h"`

			`#ifdef __H8300SX__`

			`.global _memcpy`
			`_memcpy:`
			`stm.l er4-er6,@-er7`

			`; Set up source and destination pointers for movmd.`
			`mov.l er0,er6`
			`mov.l er1,er5`

			`; See whether the copy is long enough to use the movmd.l code.`
			`; Although the code can handle anything longer than 6 bytes,`
			`; it can be more expensive than movmd.b for small moves.`
			`; It's better to use a higher threshold to account for this.`
			`;`
			`; Note that the exact overhead of the movmd.l checks depends on`
			`; the alignments of the length and pointers. They are faster when`
			`; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values`
			`; are 0. This threshold is a compromise between the various cases.`
			`cmp #16,LEN(r2)`
			`blo simple`

			`; movmd.l only works for even addresses. If one of the addresses`
			`; is odd and the other is not, fall back on a simple move.`
			`bld #0,r5l`
			`bxor #0,r6l`
			`bcs simple`

			`; Make the addresses even.`
			`bld #0,r5l`
			`bcc word_aligned`
			`mov.b @er5+,@er6+`
			`sub #1,LEN(r2)`

			`word_aligned:`
			`; See if copying one word would make the first operand longword`
			`; aligned. Although this is only really worthwhile if it aligns`
			`; the second operand as well, it's no worse if doesn't, so it`
			`; hardly seems worth the overhead of a "band" check.`
			`bld #1,r6l`
			`bcc fast_copy`
			`mov.w @er5+,@er6+`
			`sub #2,LEN(r2)`

			`fast_copy:`
			`; Set (e)r4 to the number of longwords to copy.`
			`mov LEN(r2),LEN(r4)`
			`shlr #2,LEN(r4)`

			`#ifdef __NORMAL_MODE__`
			`; 16-bit pointers and size_ts: one movmd.l is enough. This code`
			`; is never reached with r4 == 0.`
			`movmd.l`
			`and.w #3,r2`
			`simple:`
			`mov.w r2,r4`
			`beq quit`
			`movmd.b`
			`quit:`
			`rts/l er4-er6`
			`#else`
			`; Skip the first iteration if the number of longwords is divisible`
			`; by 0x10000.`
			`mov.w r4,r4`
			`beq fast_loop_next`

			`; This loop copies r4 (!= 0) longwords the first time round and 65536`
			`; longwords on each iteration after that.`
			`fast_loop:`
			`movmd.l`
			`fast_loop_next:`
			`sub.w #1,e4`
			`bhs fast_loop`

			`; Mop up any left-over bytes. We could just fall through to the`
			`; simple code after the "and" but the version below is quicker`
			`; and only takes 10 more bytes.`
			`and.w #3,r2`
			`beq quit`
			`mov.w r2,r4`
			`movmd.b`
			`quit:`
			`rts/l er4-er6`

			`simple:`
			`; Simple bytewise copy. We need to handle all lengths, including zero.`
			`mov.w r2,r4`
			`beq simple_loop_next`
			`simple_loop:`
			`movmd.b`
			`simple_loop_next:`
			`sub.w #1,e2`
			`bhs simple_loop`
			`rts/l er4-er6`
			`#endif`

			`#else`

			`.global _memcpy`
			`_memcpy:`
			`; MOVP @(2/4,r7),A0P ; dst`
			`; MOVP @(4/8,r7),A1P ; src`
			`; MOVP @(6/12,r7),A2P ; len`

			`MOVP A0P,A3P ; keep copy of final dst`
			`ADDP A2P,A0P ; point to end of dst`
			`CMPP A0P,A3P ; see if anything to do`
			`beq quit`

			`ADDP A2P,A1P ; point to end of src`

			`; lets see if we can do this in words`
			`or A0L,A2L ; or in the dst address`
			`or A3L,A2L ; or the length`
			`or A1L,A2L ; or the src address`
			`btst #0,A2L ; see if the lsb is zero`
			`bne byteloop`

			`wordloop:`
			`#ifdef __NORMAL_MODE__`
			`sub #2,A1P`
			`#else`
			`subs #2,A1P ; point to word`
			`#endif`
			`mov.w @A1P,A2 ; get word`
			`mov.w A2,@-A0P ; save word`
			`CMPP A0P,A3P ; at the front again ?`
			`bne wordloop`
			`rts`

			`byteloop:`
			`#ifdef __NORMAL_MODE__`
			`sub #1,A1P`
			`#else`
			`subs #1,A1P ; point to byte`
			`#endif`
			`mov.b @A1P,A2L ; get byte`
			`mov.b A2L,@-A0P ; save byte`
			`CMPP A0P,A3P ; at the front again ?`
			`bne byteloop`

			`; return with A0 pointing to dst`
			`quit: rts`

			`#endif`