Use downwards copy on memcpy

Makes memcpy about 5% faster at the cost of 29 extra bytes of code; memmove's speed and size are unchanged.
This commit is contained in:
Colin Leroy-Mira 2024-02-14 19:58:27 +01:00
parent 3dfe033000
commit a310192da4
3 changed files with 108 additions and 76 deletions

View File

@ -10,45 +10,61 @@
; at memmove!
;
.export _memcpy, memcpy_upwards, memcpy_getparams
.export _memcpy, memcpy_downwards, memcpy_getparams
.import popax, popptr1
.importzp sp, ptr1, ptr2, ptr3
.macpack generic
; ----------------------------------------------------------------------
_memcpy:
jsr memcpy_getparams
memcpy_upwards: ; assert Y = 0
ldx ptr3+1 ; Get high byte of n
beq L2 ; Jump if zero
memcpy_downwards:
; Copy downwards. Adjust the pointers to the end of the memory regions.
L1: .repeat 2 ; Unroll this a bit to make it faster...
lda (ptr1),Y ; copy a byte
sta (ptr2),Y
iny
lda ptr1+1
add ptr3+1
sta ptr1+1
lda ptr2+1
add ptr3+1
sta ptr2+1
; handle fractions of a page size first
ldy ptr3 ; count, low byte
bne @entry ; something to copy?
beq PageSizeCopy ; here like bra...
@copyByte:
lda (ptr1),y
sta (ptr2),y
@entry:
dey
bne @copyByte
lda (ptr1),y ; copy remaining byte
sta (ptr2),y
PageSizeCopy: ; assert Y = 0
ldx ptr3+1 ; number of pages
beq done ; none? -> done
@initBase:
dec ptr1+1 ; adjust base...
dec ptr2+1
dey ; in entry case: 0 -> FF
@copyBytes:
.repeat 3 ; unroll this a bit to make it faster...
lda (ptr1),y ; important: unrolling three times gives a nice
sta (ptr2),y ; 255/3 = 85 loop which ends at 0
dey
.endrepeat
bne L1
inc ptr1+1
inc ptr2+1
dex ; Next 256 byte block
bne L1 ; Repeat if any
; the following section could be 10% faster if we were able to copy
; back to front - unfortunately we are forced to copy strict from
; low to high since this function is also used for
; memmove and blocks could be overlapping!
; {
L2: ; assert Y = 0
ldx ptr3 ; Get the low byte of n
beq done ; something to copy
L3: lda (ptr1),Y ; copy a byte
sta (ptr2),Y
iny
dex
bne L3
; }
@copyEntry: ; in entry case: 0 -> FF
bne @copyBytes
lda (ptr1),y ; Y = 0, copy last byte
sta (ptr2),y
dex ; one page to copy less
bne @initBase ; still a page to copy?
done: jmp popax ; Pop ptr and return as result

View File

@ -8,8 +8,8 @@
; NOTE: This function uses entry points from memcpy!
;
.export _memmove
.import memcpy_getparams, memcpy_upwards, popax
.export _memmove, memcpy_upwards
.import memcpy_getparams, memcpy_downwards, popax
.importzp ptr1, ptr2, ptr3, ptr4, tmp1
.macpack generic
@ -26,53 +26,32 @@ _memmove:
cmp ptr1
txa
sbc ptr1+1
jcc memcpy_upwards ; Branch if dest < src (upwards copy)
jcs memcpy_downwards ; Branch if dest >= src (downwards copy)
; Copy downwards. Adjust the pointers to the end of the memory regions.
memcpy_upwards:
ldx ptr3+1 ; Get high byte of n
beq L2 ; Jump if zero
lda ptr1+1
add ptr3+1
sta ptr1+1
lda ptr2+1
add ptr3+1
sta ptr2+1
; handle fractions of a page size first
ldy ptr3 ; count, low byte
bne @entry ; something to copy?
beq PageSizeCopy ; here like bra...
@copyByte:
lda (ptr1),y
sta (ptr2),y
@entry:
dey
bne @copyByte
lda (ptr1),y ; copy remaining byte
sta (ptr2),y
PageSizeCopy: ; assert Y = 0
ldx ptr3+1 ; number of pages
beq done ; none? -> done
@initBase:
dec ptr1+1 ; adjust base...
dec ptr2+1
dey ; in entry case: 0 -> FF
@copyBytes:
.repeat 3 ; unroll this a bit to make it faster...
lda (ptr1),y ; important: unrolling three times gives a nice
sta (ptr2),y ; 255/3 = 85 loop which ends at 0
dey
L1: .repeat 2 ; Unroll this a bit to make it faster...
lda (ptr1),Y ; copy a byte
sta (ptr2),Y
iny
.endrepeat
@copyEntry: ; in entry case: 0 -> FF
bne @copyBytes
lda (ptr1),y ; Y = 0, copy last byte
sta (ptr2),y
dex ; one page to copy less
bne @initBase ; still a page to copy?
bne L1
inc ptr1+1
inc ptr2+1
dex ; Next 256 byte block
bne L1 ; Repeat if any
L2: ; assert Y = 0
ldx ptr3 ; Get the low byte of n
beq done ; something to copy
L3: lda (ptr1),Y ; copy a byte
sta (ptr2),Y
iny
dex
bne L3
; Done, return dest

View File

@ -0,0 +1,37 @@
#include <string.h>
#include "unittest.h"
#define BufferSize 384 // test correct page passing (>256, multiple of 128 here): one full page plus a 128-byte remainder

static char Buffer1[BufferSize];
static char Buffer2[BufferSize];

/* Fill Buffer1 with an ascending byte pattern and Buffer2 with its bitwise
** complement, so that the two buffers differ at every index before a copy.
*/
static void ResetBuffers (void)
{
    unsigned i;

    for (i = 0; i < BufferSize; ++i) {
        Buffer1[i] = i;
        Buffer2[i] = ~i;
    }
}

/* Copy a block that is larger than one page between two distinct buffers, once
** in each direction, and check that the destination matches the source
** afterwards.
*/
TEST
{
    /* Buffer1 -> Buffer2 */
    ResetBuffers ();
    memcpy(Buffer2, Buffer1, sizeof(Buffer1));
    if (memcmp(Buffer1, Buffer2, sizeof(Buffer1))) {
        printf("First memcpy failed\n");
        exit(1);
    }

    /* Buffer2 -> Buffer1 */
    ResetBuffers ();
    memcpy(Buffer1, Buffer2, sizeof(Buffer1));
    if (memcmp(Buffer2, Buffer1, sizeof(Buffer1))) {
        printf("Second memcpy failed\n");
        exit(1);
    }
}
ENDTEST