From 999d57401c657169c6694e33255cb3c3c81fe66c Mon Sep 17 00:00:00 2001 From: Greg King Date: Fri, 6 Nov 2015 23:59:19 -0500 Subject: [PATCH] Added a version of memset() that uses the HuC6280's TII instruction to get more speed. --- libsrc/pce/memcpy.s | 7 +++-- libsrc/pce/memset.s | 67 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 libsrc/pce/memset.s diff --git a/libsrc/pce/memcpy.s b/libsrc/pce/memcpy.s index e3b7bde34..1743912ea 100644 --- a/libsrc/pce/memcpy.s +++ b/libsrc/pce/memcpy.s @@ -8,9 +8,10 @@ ; ; void* __fastcall__ memcpy (void* dest, const void* src, size_t size); ; -; NOTE: This function contains entry points for memmove, which will resort -; to memcpy for an incrementing copy. Don't change this module without looking -; at "pce/memmove.s"! +; NOTE: This function contains entry points for memmove(), which resorts to +; memcpy() for incrementing copies. The PC-Engine memset() uses this memcpy() +; to fill memory quickly. Don't change this module without looking at +; "pce/memmove.s" and "pce/memset.s"! ; .export _memcpy diff --git a/libsrc/pce/memset.s b/libsrc/pce/memset.s new file mode 100644 index 000000000..45a78d533 --- /dev/null +++ b/libsrc/pce/memset.s @@ -0,0 +1,67 @@ +; +; This file, instead of "common/memset.s", will be assembled for the pce +; target. This version is smaller and faster because it uses a HuC6280 +; block-copy instruction. +; +; 1998-05-29, Ullrich von Bassewitz +; 2015-11-06, Greg King +; +; void* __fastcall__ _bzero (void* ptr, size_t n); +; void __fastcall__ bzero (void* ptr, size_t n); +; void* __fastcall__ memset (void* ptr, int c, size_t n); +; +; NOTE: bzero() will return its first argument, as memset() does. It is no +; problem to declare the return value as void, because it can be ignored. 
+; _bzero() (note the leading underscore) is declared with the proper
+; return type because the compiler will replace memset() by _bzero() if
+; the fill value is zero; and, the optimizer looks at the return type
+; to see if the value in .XA is of any use.
+;
+; NOTE: This function uses entry points from "pce/memcpy.s"!
+;
+
+        .export         __bzero, _bzero, _memset
+
+        .import         memcpy_getparams, memcpy_increment
+        .import         pushax, popax
+        .importzp       ptr1, ptr2, ptr3
+
+        .macpack        longbranch      ; supplies the "jeq" long branch used below
+
+; __bzero/_bzero: zero-fill entry; pushes a fill value of 0, then falls through into _memset.
+; ----------------------------------------------------------------------
+__bzero:
+_bzero: pha                             ; keep .A while the fill value is set up
+        cla                             ; fill with zeros
+        jsr     pushax                  ; (high byte isn't important)
+        pla                             ; get .A back; NOTE(review): relies on pushax preserving .X -- confirm
+; _memset: common fill code; the store and copy set-up below use ptr1, ptr2, and ptr3.
+_memset:
+        jsr     memcpy_getparams        ; presumably: ptr1 = c, ptr2 = ptr, ptr3 = n (TODO confirm in "pce/memcpy.s")
+
+; The fill byte is put at the beginning of the buffer; then, the buffer is
+; copied to a second buffer that starts one byte above the start of the first
+; buffer. Normally, we would use memmove() to avoid trouble; but here, we
+; exploit that overlap, by using memcpy(). Therefore, the fill value is copied
+; from each byte to the next byte, all the way to the end of the buffer.
+; NOTE(review): the store and decrement below assume n >= 1; confirm a zero n is caught earlier (e.g., in memcpy_getparams).
+        lda     ptr1                    ; get fill value
+        sta     (ptr2)                  ; write it to the first byte of the buffer
+
+        lda     ptr3                    ; count first byte
+        bne     @L3                     ; low byte isn't zero: no borrow from the high byte
+        dec     ptr3+1                  ; low byte is zero: borrow from the high byte
+@L3:    dec     a
+        sta     ptr3                    ; ptr3 now holds the count minus one
+        ora     ptr3+1                  ; result is zero only if both count bytes are zero
+        jeq     popax                   ; return ptr. if no more bytes
+
+        lda     ptr2                    ; point to first buffer
+        ldx     ptr2+1
+        sta     ptr1                    ; ptr1 = start of the buffer
+        stx     ptr1+1
+        inc     ptr2                    ; point to second buffer
+        bne     @L2                     ; low byte didn't wrap: high byte stays
+        inc     ptr2+1
+
+@L2:    jmp     memcpy_increment        ; tail jump: the rest of the fill (and the return) happens in "pce/memcpy.s"