mirror of
https://github.com/cc65/cc65.git
synced 2024-11-17 09:07:32 +00:00
81 lines
2.6 KiB
ArmAsm
81 lines
2.6 KiB
ArmAsm
;
|
|
; Ullrich von Bassewitz, 2003-08-20
|
|
; Performance increase (about 20%) by
|
|
; Christian Krueger, 2009-09-13
|
|
;
|
|
; void* __fastcall__ memcpy (void* dest, const void* src, size_t n);
|
|
;
|
|
; NOTE: This function contains entry points for memmove, which will resort
|
|
; to memcpy for an upwards copy. Don't change this module without looking
|
|
; at memmove!
|
|
;
|
|
|
|
.export _memcpy, memcpy_upwards, memcpy_getparams
|
|
.import popax
|
|
.importzp sp, ptr1, ptr2, ptr3
|
|
|
|
; ----------------------------------------------------------------------
; _memcpy
;       Fetches the arguments through memcpy_getparams and then falls
;       through into memcpy_upwards.
;
; memcpy_upwards
;       Copies the number of bytes held in ptr3 from (ptr1) to (ptr2),
;       always working from the lowest address to the highest.
;       Must be entered with Y = 0.
;       Leaves through popax, which pops the pointer still on the stack
;       and hands it back as the function result.

_memcpy:
        jsr     memcpy_getparams

memcpy_upwards:                 ; Y has to be 0 here
        ldx     ptr3+1          ; X = high byte of n = count of 256 byte pages
        beq     @tail           ; no full page, only the remainder is left

@pages:
        .repeat 2               ; two bytes per pass to reduce loop overhead
        lda     (ptr1),y
        sta     (ptr2),y
        iny
        .endrepeat
        bne     @pages          ; loop until Y has wrapped around to 0
        inc     ptr1+1          ; move both pointers on to the next page
        inc     ptr2+1
        dex                     ; one page less to go
        bne     @pages

; The remainder loop could be about 10% faster when copying back to
; front. That is not an option: memmove shares this code and the blocks
; may overlap, so the copy must run strictly from low to high addresses.

@tail:                          ; Y is 0 here
        ldx     ptr3            ; X = low byte of n = bytes left over
        beq     @exit           ; nothing left to copy

@byte:
        lda     (ptr1),y
        sta     (ptr2),y
        iny
        dex
        bne     @byte

@exit:
        jmp     popax           ; pop the remaining pointer, return it
|
|
|
|
; ----------------------------------------------------------------------
; memcpy_getparams
;       Moves the three arguments into zero page pointers:
;
;               n    (in A/X on entry)      --> ptr3
;               src  (popped from stack)    --> ptr1
;               dest (read from stack)      --> ptr2
;
;       dest is only read, not popped: it stays on the stack so that it
;       can be popped and returned in A/X later.
;
;       On exit Y = 0 (required by the copy loop), and A/X hold the
;       low/high byte that was stored into ptr2.
;       NOTE(review): the file header points at memmove as a second user
;       of this entry point - check it before changing the exit state.

memcpy_getparams:
        sta     ptr3            ; n, low byte
        stx     ptr3+1          ; n, high byte

        jsr     popax           ; fetch src
        sta     ptr1
        stx     ptr1+1

        ; Read dest straight from the stack without popping it. The
        ; direct access saves three cycles, return included.
        ldy     #1
        lda     (sp),y          ; dest, high byte
        tax
        stx     ptr2+1
        dey                     ; Y = 0, as promised to the caller
        lda     (sp),y          ; dest, low byte
        sta     ptr2
        rts
|