#440: Raphael's assembly VMX haschr/memchr/strchr plus NSPR build changes

This commit is contained in:
Cameron Kaiser 2019-01-12 13:03:22 -08:00
parent 2dba79e825
commit fa5198b512
5 changed files with 860 additions and 3 deletions

View File

@ -134,6 +134,7 @@ endif
# Unless OBJS was already set, derive it from CSRCS (.c), SSRCS (.S) and
# ASFILES (.$(ASM_SUFFIX)); every object is placed under $(OBJDIR).
ifndef OBJS
OBJS = $(addprefix $(OBJDIR)/,$(CSRCS:.c=.$(OBJ_SUFFIX))) \
$(addprefix $(OBJDIR)/,$(SSRCS:.S=.$(OBJ_SUFFIX))) \
$(addprefix $(OBJDIR)/,$(ASFILES:.$(ASM_SUFFIX)=.$(OBJ_SUFFIX)))
endif
@ -458,6 +459,13 @@ endif
endif
endif
# Build an object from a .S source by handing it to $(CC) with $(CFLAGS).
# With NEED_ABSOLUTE_PATH defined, the source is passed through pr_abspath.
$(OBJDIR)/%.$(OBJ_SUFFIX): %.S
@$(MAKE_OBJDIR)
ifdef NEED_ABSOLUTE_PATH
$(CC) -o $@ -c $(CFLAGS) $(call pr_abspath,$<)
else
$(CC) -o $@ -c $(CFLAGS) $<
endif
$(OBJDIR)/%.$(OBJ_SUFFIX): %.s
@$(MAKE_OBJDIR)
@ -509,7 +517,7 @@ endif
# hundreds of built-in suffix rules for stuff we don't need.
#
# The empty .SUFFIXES line resets the suffix list; the lines after it list
# the suffixes this tree uses.
.SUFFIXES:
.SUFFIXES: .a .$(OBJ_SUFFIX) .c .cpp .s .h .i .pl
# Same list with .S appended.
.SUFFIXES: .a .$(OBJ_SUFFIX) .c .cpp .s .h .i .pl .S
#
# Fake targets. Always run these rules, even if a file/directory with that

View File

@ -40,8 +40,11 @@ LIBRARY_VERSION = $(MOD_MAJOR_VERSION)
RELEASE_LIBS = $(TARGETS)
# When TENFOURFOX_VMX is 1: add plvmx.c to CSRCS, add -faltivec to CFLAGS,
# and add the three vmx_*.S routines to SSRCS.
ifeq ($(TENFOURFOX_VMX),1)
CSRCS += plvmx.c
CFLAGS += -faltivec
SSRCS += \
vmx_haschr.S \
vmx_memchr.S \
vmx_strchr.S \
$(NULL)
endif
ifeq ($(OS_ARCH),WINNT)

View File

@ -0,0 +1,229 @@
; VMX version of haschr in assembly (returns nonzero if the byte is present)
;
; Does not make stack frames or update the stack pointer.
;
; It uses Darwin's red zone to load/store vector values,
; and part of the code assumes memory loads are BE ordered.
;
; r3: const void *b (input param)
; r4: int c (input param)
; r5: size_t length (input param)
;
; All GPRs used are volatile.
;SPR number used with mfspr/mtspr to read and write the vector save mask
#define VRSAVE 256
;CR bit numbers tested after a recording vector compare:
;bit 26 is set when no element compared equal, bit 24 when all of them did
#define VMX_ALL_NE 26
#define VMX_ALL_EQ 24
#ifdef _PPC970_
;the 970 build inserts a nop at each padding slot of the vector loop
#define PADDING nop
#warning using 64-bit code
.machine ppc970
#else
;other builds leave the padding slots empty
#define PADDING
.machine ppc7400
#endif
.text
.globl _vmx_haschr
.align 4
;-----------------------------------------------------------------------
;_vmx_haschr: report whether a byte value occurs in a memory range
;In:   r3 = b, r4 = c (only the low 8 bits are compared), r5 = length
;Out:  r3 = 1 if the byte occurs in the first length bytes at b, else 0
;Uses: r0, r3-r12, ctr, cr0, cr1, cr6, v0-v2, and the byte at -16(r1)
;The save mask register holds its entry value again on every return.
;
;Layout of the search: bytes up to the next 16-byte boundary (prefix) are
;checked one at a time, whole aligned quadwords are checked with vector
;compares, and the remaining tail (suffix) is checked one byte at a time.
;-----------------------------------------------------------------------
_vmx_haschr:
mfspr r12,VRSAVE ;Get old VRSAVE
cmplwi cr0,r5,16
neg r11,r3
add r7,r3,r5
;prefix length in r6, suffix length in r7
clrlwi r6,r11,28
clrlwi r7,r7,28 ;logical AND 0xF
;total length - prefix length
sub r8,r5,r6
;prefix/total predecrements in its loop
addi r6,r6,1
addi r9,r5,1
;bytes into quadwords
srwi r8,r8,4 ;suffix is < 16bytes, no need to subtract it.
;if total length is < 16, skip vmx
blt cr0,L_novmx
cmplwi cr1,r8,0
;new VRSAVE
oris r0,r12,0xE000 ;VR0-VR2 = 0xE0000000
li r11,-16
stb r4,-16(r1) ;store searchByte in red zone
mtctr r6
;no whole aligned quadword inside the range: use the byte loop instead
beq cr1,L_novmx
;VMX is used
mtspr VRSAVE,r0
;load, splat searchByte in VR0
lvebx v0,r1,r11
vspltb v0,v0,0
;truncate int to uchar
clrlwi r4,r4,24
L_prefix_loop: ;prefix loop
;check for a prefix before actually looping
bdz L_prefix_end
lbz r9,0(r3)
addi r3,r3,1
cmplw cr1,r4,r9
bne cr1,L_prefix_loop
;found in the prefix: restore the save mask and report success
mtspr VRSAVE,r12
li r3,1
blr
L_prefix_end:
;check if there is a suffix
cmplwi cr0,r7,0
mtctr r8
;first VMX iteration is outside the loop
lvx v1,0,r3
li r10,16
vcmpequb. v2,v1,v0
bdz L_vmx_end
.align 4
;Each group below loads the next quadword, advances the offset in r10,
;then branches on the result of the previous compare before issuing the
;compare for the quadword just loaded. ctr counts quadwords remaining.
L_vmx_loop: ;vector loop
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_found
PADDING
vcmpequb. v2,v1,v0
bdz L_vmx_end
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_found
PADDING
vcmpequb. v2,v1,v0
bdz L_vmx_end
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_found
PADDING
vcmpequb. v2,v1,v0
bdz L_vmx_end
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_found
PADDING
vcmpequb. v2,v1,v0
bdz L_vmx_end
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_found
PADDING
vcmpequb. v2,v1,v0
bdz L_vmx_end
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_found
PADDING
vcmpequb. v2,v1,v0
bdnz L_vmx_loop
L_vmx_end:
;r3 now points just past the last quadword that was compared
add r3,r3,r10
;test the result of the final compare
bf VMX_ALL_NE,L_found
mtctr r7
;skip suffix if nonexistent
beq cr0,L_notfound
.align 4
L_suffix_loop: ;suffix loop
lbz r5,0(r3)
addi r3,r3,1
cmplw cr0,r4,r5
beq cr0,L_found
bdnz L_suffix_loop
L_notfound:
mtspr VRSAVE,r12
li r3,0
blr
L_found:
mtspr VRSAVE,r12
li r3,1
blr
;Path that skips VMX
;The save mask register has not been modified on this path, so it is not
;restored here. ctr is loaded with length + 1 because the loop decrements
;before it tests.
L_novmx:
mtctr r9
clrlwi r4,r4,24
L_novmx_loop:
bdz L_notfound_1
lbz r5,0(r3)
addi r3,r3,1
cmpw cr0,r4,r5
bne cr0,L_novmx_loop
L_found_1:
;report 1 like the other found paths instead of the match address
li r3,1
blr
L_notfound_1:
li r3,0
blr
.subsections_via_symbols

View File

@ -0,0 +1,311 @@
; VMX version of memchr in assembly
;
; Does not make stack frames or update the stack pointer.
;
; It uses Darwin's red zone to load/store vector values,
; and part of the code assumes memory loads are BE ordered.
;
; r3: const void *b (input param)
; r4: int c (input param)
; r5: size_t length (input param)
;
; All GPRs used are volatile.
;SPR number used with mfspr/mtspr to read and write the vector save mask
#define VRSAVE 256
;CR bit numbers tested after a recording vector compare:
;bit 26 is set when no element compared equal, bit 24 when all of them did
#define VMX_ALL_NE 26
#define VMX_ALL_EQ 24
#ifdef _PPC970_
;the 970 build inserts a nop at each padding slot
#define PADDING nop
#warning using 64-bit code
.machine ppc970
#else
;other builds leave the padding slots empty
#define PADDING
.machine ppc7400
#endif
.text
.globl _vmx_memchr
.align 4
;-----------------------------------------------------------------------
;_vmx_memchr: find the first occurrence of a byte value in a memory range
;In:   r3 = b, r4 = c (only the low 8 bits are compared), r5 = length
;Out:  r3 = address of the first matching byte, or 0 if there is none
;Uses: r0, r3-r12, ctr, cr0, cr1, cr6, v0-v2, and 16 bytes at -16(r1)
;The save mask register holds its entry value again on every return.
;
;Layout of the search: bytes up to the next 16-byte boundary (prefix) are
;checked one at a time, whole aligned quadwords are checked with vector
;compares, and the remaining tail (suffix) is checked one byte at a time.
;-----------------------------------------------------------------------
_vmx_memchr:
mfspr r12,VRSAVE ;Get old VRSAVE
cmplwi cr0,r5,16
neg r11,r3
add r7,r3,r5
;prefix length in r6, suffix length in r7
clrlwi r6,r11,28
clrlwi r7,r7,28 ;logical AND 0xF
;total length - prefix length
sub r8,r5,r6
;prefix/total predecrements in its loop
addi r6,r6,1
addi r9,r5,1
;bytes into quadwords
srwi r8,r8,4 ;suffix is < 16bytes, no need to subtract it.
;if total length is < 16, skip vmx
blt cr0,L_novmx
cmplwi cr1,r8,0
;new VRSAVE
oris r0,r12,0xE000 ;VR0-VR2 = 0xE0000000
li r11,-16
stb r4,-16(r1) ;store searchByte in red zone
mtctr r6
;no whole aligned quadword inside the range: use the byte loop instead
beq cr1,L_novmx
;VMX is used
mtspr VRSAVE,r0
;load, splat searchByte in VR0
lvebx v0,r1,r11
vspltb v0,v0,0
;truncate int to uchar
clrlwi r4,r4,24
L_prefix_loop: ;prefix loop
;check for a prefix before actually looping
bdz L_prefix_end
lbz r9,0(r3)
addi r3,r3,1
cmplw cr1,r4,r9
bne cr1,L_prefix_loop
mtspr VRSAVE,r12 ;found the byte. c'ya ;-)
;r3 was already advanced past the match, so step back one byte
la r3,-1(r3)
blr
L_prefix_end:
;check if there is a suffix
cmplwi cr0,r7,0
mtctr r8
;first VMX iteration is outside the loop
lvx v1,0,r3
li r10,16
vcmpequb. v2,v1,v0
bdz L_vmx_end
.align 4
;Each group below loads the next quadword, advances the offset in r10,
;then branches on the result of the previous compare before issuing the
;compare for the quadword just loaded. ctr counts quadwords remaining.
;When a branch to L_foundvmx is taken, v2 still holds the previous result.
L_vmx_loop: ;vector loop
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_foundvmx
PADDING
vcmpequb. v2,v1,v0
bdz L_vmx_end
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_foundvmx
PADDING
vcmpequb. v2,v1,v0
bdz L_vmx_end
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_foundvmx
PADDING
vcmpequb. v2,v1,v0
bdz L_vmx_end
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_foundvmx
PADDING
vcmpequb. v2,v1,v0
bdz L_vmx_end
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_foundvmx
PADDING
vcmpequb. v2,v1,v0
bdz L_vmx_end
lvx v1,r3,r10
la r10,16(r10)
PADDING
bf VMX_ALL_NE,L_foundvmx
PADDING
vcmpequb. v2,v1,v0
bdnz L_vmx_loop
L_vmx_end:
;r3 now points just past the last quadword that was compared
add r3,r3,r10
;test the result of the final compare
bf VMX_ALL_NE,L_foundvmx_1
mtctr r7
;skip suffix if nonexistent
beq cr0,L_notfound
.align 4
L_suffix_loop: ;suffix loop
lbz r5,0(r3)
addi r3,r3,1
cmplw cr0,r4,r5
beq cr0,L_found
bdnz L_suffix_loop
L_notfound:
mtspr VRSAVE,r12
li r3,0
blr
L_found:
mtspr VRSAVE,r12
;r3 was already advanced past the match, so step back one byte
la r3,-1(r3)
blr
.align 4
L_foundvmx:
;r10 was already advanced for the next load, so the quadword whose compare
;hit starts at r3 + r10 - 32: 16 is subtracted here and 16 more below
subi r3,r3,16
add r3,r3,r10
L_foundvmx_1:
stvx v2,r1,r11 ;store result vector in the red zone
mtspr VRSAVE,r12 ;restore VRSAVE
;make r3 point the "good" quadword
subi r3,r3,16
;From here, the result vector from last executed vcmpequb is stored in -16(r1).
;
;Following part searches the 'good' byte in GPRs (32 or 64 bits GPRS according to cpu)
;It loads the vector containing the 'good' byte in GPRs, searches for first non-zero
;GPR, counts the leading 0's and divides this by 8.
;This gives the 'good' byte's position inside the GPR from MS Byte to LS BYTE
;(0 MS, 3 LS).
;This cuts the number of checks to do from 16 to 4 (32bit GPRs) or 2 (64bit GPRs).
#ifdef _PPC970_
ld r5,-16(r1)
ld r6,-8(r1)
cmpldi cr0,r5,0
cntlzd r4,r5
bne cr0,L_bytefound ;0-7
addi r3,r3,8 ;8-15
cntlzd r4,r6
#else
;PPC32
;r3 is advanced by 4 for every all-zero word that is skipped, and r4
;receives the leading-zero count of the word currently being examined
lwz r5,-16(r1)
lwz r6,-12(r1)
cmplwi cr0,r5,0
cntlzw r4,r5
cmplwi cr1,r6,0
lwz r7,-8(r1)
bne cr0,L_bytefound ;0-3
addi r3,r3,4
cntlzw r4,r6
bne cr1,L_bytefound ;4-7
cmplwi cr0,r7,0
addi r3,r3,4
cntlzw r4,r7
lwz r8,-4(r1)
bne cr0,L_bytefound ;8-11
cntlzw r4,r8 ;12-15
addi r3,r3,4
#endif
L_bytefound:
PADDING
;leading-zero bit count / 8 = byte index of the match inside the register
srwi r6,r4,3
PADDING
add r3,r6,r3
blr
;Path that skips VMX
;The save mask register has not been modified on this path, so it is not
;restored here. ctr is loaded with length + 1 because the loop decrements
;before it tests.
L_novmx:
mtctr r9
clrlwi r4,r4,24
L_novmx_loop:
bdz L_notfound_1
lbz r5,0(r3)
addi r3,r3,1
cmpw cr0,r4,r5
bne cr0,L_novmx_loop
L_found_1:
;r3 was already advanced past the match, so step back one byte
la r3,-1(r3)
blr
L_notfound_1:
li r3,0
blr
.subsections_via_symbols

View File

@ -0,0 +1,306 @@
; VMX version of strchr in assembly
;
; Does not make stack frames or update the stack pointer.
;
; It uses Darwin's red zone to load/store vector values,
; and part of the code assumes memory loads are BE ordered.
;
; r3: const void *p (input param)
; r4: int ch (input param)
;
; All GPRs used are volatile.
;SPR number used with mfspr/mtspr to read and write the vector save mask
#define VRSAVE 256
;CR bit numbers tested after a recording vector compare:
;bit 26 is set when no element compared equal, bit 24 when all of them did
#define VMX_ALL_NE 26
#define VMX_ALL_EQ 24
#ifdef _PPC970_
#warning using 64-bit code
.machine ppc970
#else
.machine ppc7400
#endif
.text
.align 4
.globl _vmx_strchr
;-----------------------------------------------------------------------
;_vmx_strchr: find the first occurrence of a byte value in a C string
;In:   r3 = p, r4 = ch (only the low 8 bits are compared)
;Out:  r3 = address of the first byte equal to ch, or 0 if the terminating
;      zero byte comes first; when the low 8 bits of ch are zero, r3 is
;      the address of the terminating zero byte
;Uses: r3-r12, ctr, cr0, cr1, cr6, v0-v6, and 16 bytes at -16(r1)
;The save mask register holds its entry value again on every return.
;
;Bytes up to the next 16-byte boundary are checked one at a time; after
;that the string is scanned one aligned quadword at a time.
;-----------------------------------------------------------------------
_vmx_strchr:
;truncate value to 8bit, update CR0
clrlwi. r4,r4,24
neg r6,r3
mfspr r12,VRSAVE ;get old VRSAVE
li r11,-16
stb r4,-16(r1)
;prefix length in r6
clrlwi r6,r6,28 ;logical AND 0xF
oris r9,r12,0xFE00 ;VR0-VR6 = 0xFE000000
mtspr VRSAVE,r9
;v0 = all zero bytes (end-of-string pattern), v6 = all 0x07 bytes
vspltisb v0,0
vspltisb v6,7
;ctr = prefix length + 1 because the first bdz decrements before testing
addi r6,r6,1
mtctr r6
;If we search for NULL, use specific code
;(cr0 still holds the result of the recording clrlwi above)
beq cr0,L_NULL_ONLY
;v3 = searchByte replicated into every byte
lvebx v3,r1,r11
vspltb v3,v3,0
;check if there is a prefix first
bdz L_prefix_end
lbz r5,0(r3)
.align 4
;The byte to test is loaded one iteration ahead: each pass tests r5 and
;then loads the byte for the next pass.
L_prefix_loop:
addi r3,r3,1
cmpw cr0,r4,r5
cmpwi cr1,r5,0
beq cr0,L_found
beq cr1,L_notfound_1
lbz r5,0(r3)
bdnz L_prefix_loop
;The end-of-string and searchByte vectors
;are compared against memory. Both result vectors are
;then compared together. If there's no end-of-string
;or searchByte found, both result vectors should be
;filled with 0's.
;
;If anything is found there will be an inequality between
;the two.
;
;(Not applicable if the searchByte in question is NULL)
L_prefix_end:
lvx v1,0,r3
vcmpequb v4,v1,v0
vcmpequb v2,v1,v3
li r10,16
vcmpequb. v5,v2,v4
;bit 24 is the all-equal bit, the same bit the loop below tests by name
bf 24,L_foundsomething
.align 4
;r10 is the byte offset of the next quadword; after the la in each group
;the quadword just examined starts at r3 + r10 - 16
L_vmx_loop:
lvx v1,r3,r10
vcmpequb v4,v1,v0
vcmpequb v2,v1,v3
la r10,16(r10)
vcmpequb. v5,v2,v4
bf VMX_ALL_EQ,L_foundsomething
lvx v1,r3,r10
vcmpequb v4,v1,v0
vcmpequb v2,v1,v3
la r10,16(r10)
vcmpequb. v5,v2,v4
bf VMX_ALL_EQ,L_foundsomething
lvx v1,r3,r10
vcmpequb v4,v1,v0
vcmpequb v2,v1,v3
la r10,16(r10)
vcmpequb. v5,v2,v4
bf VMX_ALL_EQ,L_foundsomething
lvx v1,r3,r10
vcmpequb v4,v1,v0
vcmpequb v2,v1,v3
la r10,16(r10)
vcmpequb. v5,v2,v4
bf VMX_ALL_EQ,L_foundsomething
lvx v1,r3,r10
vcmpequb v4,v1,v0
vcmpequb v2,v1,v3
la r10,16(r10)
vcmpequb. v5,v2,v4
bt VMX_ALL_EQ,L_vmx_loop
L_foundsomething:
;end-of-string bytes will have a 0x07 pattern
;in case of equality instead of 0xFF.
vand v4,v4,v6
;OR the two result vectors together.
vor v2,v2,v4
;The NULL-only search enters here with its compare result already in v2
L_found_NULL:
stvx v2,r1,r11
;point r3 at the quadword that produced the result in v2
add r3,r3,r10
subi r3,r3,16
mtspr VRSAVE,r12
;both result vectors OR'd together are stored at -16(r1).
;load the vector in GPRs and stop at the first nonzero GPR.
;Count leading zeros; if the result is odd it is end-of-string
;(a 0x07 byte contributes five leading zero bits, a 0xFF byte none).
;Otherwise it's the "good" byte.
;
;If we searched for NULL then it'll always be the good byte.
#ifdef _PPC970_
ld r5,-16(r1)
ld r6,-8(r1)
cmpldi cr0,r5,0
cntlzd r4,r5
bne cr0,L_bytefound ;0-7
addi r3,r3,8 ;8-15
cntlzd r4,r6
#else
;PPC32
;r3 is advanced by 4 for every all-zero word that is skipped, and r4
;receives the leading-zero count of the word currently being examined
lwz r5,-16(r1)
lwz r6,-12(r1)
cmplwi cr0,r5,0
cntlzw r4,r5
cmplwi cr1,r6,0
lwz r7,-8(r1)
bne cr0,L_bytefound ;0-3
addi r3,r3,4
cntlzw r4,r6
bne cr1,L_bytefound ;4-7
cmplwi cr0,r7,0
addi r3,r3,4
cntlzw r4,r7
lwz r8,-4(r1)
bne cr0,L_bytefound ;8-11
cntlzw r4,r8 ;12-15
addi r3,r3,4
#endif
L_bytefound:
clrlwi r7,r4,31 ;look if it's an odd number
;leading-zero bit count / 8 = byte index inside the register
srwi r6,r4,3
cmplwi cr0,r7,0
nop
;odd count: the terminator comes before any match (the save mask
;register was already restored above, hence L_notfound)
bne cr0,L_notfound
add r3,r6,r3 ;update r3 to point to the good byte
blr
.align 4
;Code to execute when searching for NULL
;Byte-wise prefix first (ctr was set to prefix length + 1 at entry)
L_NULL_ONLY:
bdz L_NULLVMX
lbz r5,0(r3)
addi r3,r3,1
cmplwi cr0,r5,0
bne cr0,L_NULL_ONLY ;prefix loop
;r3 was already advanced past the zero byte, so step back one byte
la r3,-1(r3)
mtspr VRSAVE,r12
blr
;Vector scan for a zero byte. bc 4,26 branches when bit 26 is clear, that
;is when at least one byte of the quadword compared equal to zero.
L_NULLVMX:
lvx v1,0,r3
vcmpequb. v2,v1,v0
li r10,16
bc 4,26,L_found_NULL
L_NULL_LOOP:
lvx v1,r3,r10
vcmpequb. v2,v1,v0
la r10,16(r10)
bc 4,26,L_found_NULL
lvx v1,r3,r10
vcmpequb. v2,v1,v0
la r10,16(r10)
bc 4,26,L_found_NULL
lvx v1,r3,r10
vcmpequb. v2,v1,v0
la r10,16(r10)
bc 4,26,L_found_NULL
lvx v1,r3,r10
vcmpequb. v2,v1,v0
la r10,16(r10)
bc 4,26,L_found_NULL
lvx v1,r3,r10
vcmpequb. v2,v1,v0
la r10,16(r10)
bc 4,26,L_found_NULL
lvx v1,r3,r10
vcmpequb. v2,v1,v0
la r10,16(r10)
;bc 12,26 branches when bit 26 is set: no zero byte yet, keep looping
bc+ 12,26,L_NULL_LOOP
b L_found_NULL
;L_notfound_1 is used by paths that still have to restore the save mask
L_notfound_1:
mtspr VRSAVE,r12
L_notfound:
li r3,0
blr
;match found by the byte-wise prefix loop
L_found:
mtspr VRSAVE,r12
;r3 was already advanced past the match, so step back one byte
la r3,-1(r3)
blr
.subsections_via_symbols