mirror of
https://github.com/cc65/cc65.git
synced 2025-01-14 00:32:08 +00:00
Improved memset/memcpy/memmove functions by Christian Krueger.
git-svn-id: svn://svn.cc65.org/cc65/trunk@4200 b7a2c559-68d2-44c3-8de9-860c34a00d81
This commit is contained in:
parent
2153cc46db
commit
0e91f15ea8
@ -1,5 +1,7 @@
|
||||
;
|
||||
; Ullrich von Bassewitz, 2003-08-20
|
||||
; Performance increase (about 20%) by
|
||||
; Christian Krueger, 2009-09-13
|
||||
;
|
||||
; void* __fastcall__ memcpy (void* dest, const void* src, size_t n);
|
||||
;
|
||||
@ -10,61 +12,69 @@
|
||||
|
||||
.export _memcpy, memcpy_upwards, memcpy_getparams
|
||||
.import popax
|
||||
.importzp ptr1, ptr2, ptr3, tmp1
|
||||
.importzp sp, ptr1, ptr2, ptr3
|
||||
|
||||
; ----------------------------------------------------------------------
|
||||
_memcpy:
|
||||
jsr memcpy_getparams
|
||||
|
||||
memcpy_upwards:
|
||||
ldy #0
|
||||
ldx ptr3 ; Get low counter byte
|
||||
memcpy_upwards: ; assert Y = 0
|
||||
ldx ptr3+1 ; Get high byte of n
|
||||
beq L2 ; Jump if zero
|
||||
|
||||
; Copy loop
|
||||
|
||||
@L1: inx ; Bump low counter byte
|
||||
beq @L3 ; Jump on overflow
|
||||
@L2: lda (ptr1),y
|
||||
sta (ptr2),y
|
||||
L1: .repeat 2 ; Unroll this a bit to make it faster...
|
||||
lda (ptr1),Y ; copy a byte
|
||||
sta (ptr2),Y
|
||||
iny
|
||||
bne @L1
|
||||
inc ptr1+1 ; Bump pointers
|
||||
.endrepeat
|
||||
bne L1
|
||||
inc ptr1+1
|
||||
inc ptr2+1
|
||||
bne @L1 ; Branch always
|
||||
@L3: inc ptr3+1 ; Bump high counter byte
|
||||
bne @L2
|
||||
dex ; Next 256 byte block
|
||||
bne L1 ; Repeat if any
|
||||
|
||||
; Done. The low byte of dest is still in ptr2
|
||||
; the following section could be 10% faster if we were able to copy
|
||||
; back to front - unfortunately we are forced to copy strictly from
|
||||
; low to high since this function is also used for
|
||||
; memmove and blocks could be overlapping!
|
||||
; {
|
||||
L2: ; assert Y = 0
|
||||
ldx ptr3 ; Get the low byte of n
|
||||
beq done ; something to copy
|
||||
|
||||
done: lda ptr2
|
||||
ldx tmp1 ; get function result (dest)
|
||||
rts
|
||||
L3: lda (ptr1),Y ; copy a byte
|
||||
sta (ptr2),Y
|
||||
iny
|
||||
dex
|
||||
bne L3
|
||||
|
||||
; }
|
||||
|
||||
done: jmp popax ; Pop ptr and return as result
|
||||
|
||||
; ----------------------------------------------------------------------
|
||||
; Get the parameters from stack as follows:
|
||||
;
|
||||
; -(size-1) --> ptr3
|
||||
; size --> ptr3
|
||||
; src --> ptr1
|
||||
; dest --> ptr2
|
||||
; high(dest) --> tmp1
|
||||
;
|
||||
; dest is returned in a/x.
|
||||
; First argument (dest) will remain on stack and is returned in a/x!
|
||||
|
||||
memcpy_getparams:
|
||||
eor #$FF
|
||||
memcpy_getparams: ; IMPORTANT! Function has to leave with Y=0!
|
||||
sta ptr3
|
||||
txa
|
||||
eor #$FF
|
||||
sta ptr3+1 ; Save -(size-1)
|
||||
stx ptr3+1 ; save n to ptr3
|
||||
|
||||
jsr popax ; src
|
||||
jsr popax
|
||||
sta ptr1
|
||||
stx ptr1+1
|
||||
stx ptr1+1 ; save src to ptr1
|
||||
|
||||
jsr popax ; dest
|
||||
; save dest to ptr2
|
||||
ldy #1 ; (direct stack access is three cycles faster
|
||||
; (total cycle count with return))
|
||||
lda (sp),y
|
||||
tax
|
||||
stx ptr2+1 ; save high byte of ptr2
|
||||
dey ; Y = 0
|
||||
lda (sp),y ; Get ptr2 low
|
||||
sta ptr2
|
||||
stx ptr2+1 ; Save work copy
|
||||
stx tmp1 ; Save for function result
|
||||
|
||||
rts
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
;
|
||||
; Ullrich von Bassewitz, 2003-08-20
|
||||
; Performance increase (about 20%) by
|
||||
; Christian Krueger, 2009-09-13
|
||||
;
|
||||
; void* __fastcall__ memmove (void* dest, const void* src, size_t size);
|
||||
;
|
||||
@ -7,7 +9,7 @@
|
||||
;
|
||||
|
||||
.export _memmove
|
||||
.import memcpy_getparams, memcpy_upwards
|
||||
.import memcpy_getparams, memcpy_upwards, popax
|
||||
.importzp ptr1, ptr2, ptr3, ptr4, tmp1
|
||||
|
||||
.macpack generic
|
||||
@ -15,9 +17,6 @@
|
||||
|
||||
; ----------------------------------------------------------------------
|
||||
_memmove:
|
||||
sta ptr4
|
||||
stx ptr4+1 ; Size -> ptr4
|
||||
|
||||
jsr memcpy_getparams
|
||||
|
||||
; Check for the copy direction. If dest < src, we must copy upwards (start at
|
||||
@ -33,35 +32,53 @@ _memmove:
|
||||
; Copy downwards. Adjust the pointers to the end of the memory regions.
|
||||
|
||||
lda ptr1+1
|
||||
add ptr4+1
|
||||
add ptr3+1
|
||||
sta ptr1+1
|
||||
|
||||
lda ptr2+1
|
||||
add ptr4+1
|
||||
add ptr3+1
|
||||
sta ptr2+1
|
||||
|
||||
; Load the low offset into Y, and the counter low byte into X.
|
||||
; handle fractions of a page size first
|
||||
|
||||
ldy ptr4
|
||||
ldx ptr3
|
||||
jmp @L2
|
||||
ldy ptr3 ; count, low byte
|
||||
bne @entry ; something to copy?
|
||||
beq PageSizeCopy ; here like bra...
|
||||
|
||||
; Copy loop
|
||||
|
||||
@L1: dey
|
||||
@copyByte:
|
||||
lda (ptr1),y
|
||||
sta (ptr2),y
|
||||
@entry:
|
||||
dey
|
||||
bne @copyByte
|
||||
lda (ptr1),y ; copy remaining byte
|
||||
sta (ptr2),y
|
||||
|
||||
@L2: inx ; Bump counter low byte
|
||||
bne @L1
|
||||
dec ptr1+1
|
||||
PageSizeCopy: ; assert Y = 0
|
||||
ldx ptr3+1 ; number of pages
|
||||
beq done ; none? -> done
|
||||
|
||||
@initBase:
|
||||
dec ptr1+1 ; adjust base...
|
||||
dec ptr2+1
|
||||
inc ptr3+1 ; Bump counter high byte
|
||||
bne @L1
|
||||
dey ; in entry case: 0 -> FF
|
||||
lda (ptr1),y ; need to copy this 'intro byte'
|
||||
sta (ptr2),y ; to 'land' later on Y=0! (as a result of the '.repeat'-block!)
|
||||
dey ; FF ->FE
|
||||
@copyBytes:
|
||||
.repeat 2 ; Unroll this a bit to make it faster...
|
||||
lda (ptr1),y
|
||||
sta (ptr2),y
|
||||
dey
|
||||
.endrepeat
|
||||
@copyEntry: ; in entry case: 0 -> FF
|
||||
bne @copyBytes
|
||||
lda (ptr1),y ; Y = 0, copy last byte
|
||||
sta (ptr2),y
|
||||
dex ; one page to copy less
|
||||
bne @initBase ; still a page to copy?
|
||||
|
||||
; Done, return dest
|
||||
|
||||
done: lda ptr2
|
||||
ldx tmp1 ; get function result (dest)
|
||||
rts
|
||||
done: jmp popax ; Pop ptr and return as result
|
||||
|
||||
|
@ -1,9 +1,11 @@
|
||||
;
|
||||
; void* memset (void* ptr, int c, size_t n);
|
||||
; void* _bzero (void* ptr, size_t n);
|
||||
; void bzero (void* ptr, size_t n);
|
||||
; void* __fastcall__ memset (void* ptr, int c, size_t n);
|
||||
; void* __fastcall__ _bzero (void* ptr, size_t n);
|
||||
; void __fastcall__ bzero (void* ptr, size_t n);
|
||||
;
|
||||
; Ullrich von Bassewitz, 29.05.1998
|
||||
; Performance increase (about 20%) by
|
||||
; Christian Krueger, 12.09.2009
|
||||
;
|
||||
; NOTE: bzero will return its first argument as memset does. It is no problem
|
||||
; to declare the return value as void, since it may be ignored. _bzero
|
||||
@ -15,57 +17,79 @@
|
||||
|
||||
.export _memset, _bzero, __bzero
|
||||
.import popax
|
||||
.importzp sp, ptr1, ptr2, ptr3, tmp1
|
||||
.importzp sp, ptr1, ptr2, ptr3
|
||||
|
||||
_bzero:
|
||||
__bzero:
|
||||
sta ptr3
|
||||
stx ptr3+1 ; Save n
|
||||
lda #0 ; Fill with zeros
|
||||
ldx #0 ; Fill with zeros
|
||||
beq common
|
||||
|
||||
_memset:
|
||||
sta ptr3 ; Save n
|
||||
stx ptr3+1
|
||||
jsr popax ; Get c
|
||||
tax
|
||||
|
||||
; Common stuff for memset and bzero from here
|
||||
|
||||
common: sta tmp1 ; Save the fill value
|
||||
common: ; Fill value is in X!
|
||||
ldy #1
|
||||
lda (sp),y
|
||||
tax
|
||||
dey
|
||||
sta ptr1+1 ; save high byte of ptr
|
||||
dey ; Y = 0
|
||||
lda (sp),y ; Get ptr
|
||||
sta ptr1
|
||||
stx ptr1+1 ; Save work copy
|
||||
|
||||
lda tmp1 ; Load fill value
|
||||
ldy #0
|
||||
lsr ptr3+1 ; divide number of
|
||||
ror ptr3 ; bytes by two to increase
|
||||
bcc evenCount ; speed (ptr3 = ptr3/2)
|
||||
oddCount:
|
||||
; y is still 0 here
|
||||
txa ; restore fill value
|
||||
sta (ptr1),y ; save value and increase
|
||||
inc ptr1 ; dest. pointer
|
||||
bne evenCount
|
||||
inc ptr1+1
|
||||
evenCount:
|
||||
lda ptr1 ; build second pointer section
|
||||
clc
|
||||
adc ptr3 ; ptr2 = ptr1 + (length/2) <- ptr3
|
||||
sta ptr2
|
||||
lda ptr1+1
|
||||
adc ptr3+1
|
||||
sta ptr2+1
|
||||
|
||||
txa ; restore fill value
|
||||
ldx ptr3+1 ; Get high byte of n
|
||||
beq L2 ; Jump if zero
|
||||
|
||||
; Set 256 byte blocks
|
||||
|
||||
; Set 256/512 byte blocks
|
||||
; y is still 0 here
|
||||
L1: .repeat 2 ; Unroll this a bit to make it faster
|
||||
sta (ptr1),y ; Set one byte
|
||||
sta (ptr1),y ; Set byte in lower section
|
||||
sta (ptr2),y ; Set byte in upper section
|
||||
iny
|
||||
.endrepeat
|
||||
bne L1
|
||||
inc ptr1+1
|
||||
inc ptr2+1
|
||||
dex ; Next 256 byte block
|
||||
bne L1 ; Repeat if any
|
||||
|
||||
; Set the remaining bytes if any
|
||||
|
||||
L2: ldx ptr3 ; Get the low byte of n
|
||||
beq L9 ; Low byte is zero
|
||||
L2: ldy ptr3 ; Get the low byte of n
|
||||
bne L3 ; something to set?
|
||||
jmp popax ; no -> Pop ptr and return as result
|
||||
|
||||
L3: sta (ptr1),y ; Set one byte
|
||||
iny
|
||||
dex ; Done?
|
||||
bne L3
|
||||
|
||||
L9: jmp popax ; Pop ptr and return as result
|
||||
L3a: sta (ptr1),y ; set bytes in low
|
||||
sta (ptr2),y ; and high section
|
||||
L3: dey
|
||||
bne L3a
|
||||
sta (ptr1),y ; Set remaining byte(s)
|
||||
sta (ptr2),y
|
||||
jmp popax ; Pop ptr and return as result
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user