From fa5198b512984365a8aa52aac4566efab70ed681 Mon Sep 17 00:00:00 2001
From: Cameron Kaiser
Date: Sat, 12 Jan 2019 13:03:22 -0800
Subject: [PATCH] #440: Raphael's assembly VMX haschr/memchr/strchr plus NSPR build changes

---
 nsprpub/config/rules.mk           |  10 +-
 nsprpub/lib/libc/src/Makefile.in  |   7 +-
 nsprpub/lib/libc/src/vmx_haschr.S | 229 ++++++++++++++++++++++
 nsprpub/lib/libc/src/vmx_memchr.S | 311 ++++++++++++++++++++++++++++++
 nsprpub/lib/libc/src/vmx_strchr.S | 306 +++++++++++++++++++++++++++++
 5 files changed, 860 insertions(+), 3 deletions(-)
 create mode 100644 nsprpub/lib/libc/src/vmx_haschr.S
 create mode 100644 nsprpub/lib/libc/src/vmx_memchr.S
 create mode 100644 nsprpub/lib/libc/src/vmx_strchr.S

diff --git a/nsprpub/config/rules.mk b/nsprpub/config/rules.mk
index 1c8fdc9b8..e0518db73 100644
--- a/nsprpub/config/rules.mk
+++ b/nsprpub/config/rules.mk
@@ -134,6 +134,7 @@ endif
 
 ifndef OBJS
 OBJS = $(addprefix $(OBJDIR)/,$(CSRCS:.c=.$(OBJ_SUFFIX))) \
+       $(addprefix $(OBJDIR)/,$(SSRCS:.S=.$(OBJ_SUFFIX))) \
       $(addprefix $(OBJDIR)/,$(ASFILES:.$(ASM_SUFFIX)=.$(OBJ_SUFFIX)))
 endif
 
@@ -458,6 +459,13 @@ endif
 endif
 endif
 
+$(OBJDIR)/%.$(OBJ_SUFFIX): %.S
+	@$(MAKE_OBJDIR)
+ifdef NEED_ABSOLUTE_PATH
+	$(CC) -o $@ -c $(CFLAGS) $(call pr_abspath,$<)
+else
+	$(CC) -o $@ -c $(CFLAGS) $<
+endif
 
 $(OBJDIR)/%.$(OBJ_SUFFIX): %.s
 	@$(MAKE_OBJDIR)
@@ -509,7 +517,7 @@ endif
 # hundreds of built-in suffix rules for stuff we don't need.
 #
 .SUFFIXES:
-.SUFFIXES: .a .$(OBJ_SUFFIX) .c .cpp .s .h .i .pl
+.SUFFIXES: .a .$(OBJ_SUFFIX) .c .cpp .s .h .i .pl .S
 
 #
 # Fake targets. Always run these rules, even if a file/directory with that
diff --git a/nsprpub/lib/libc/src/Makefile.in b/nsprpub/lib/libc/src/Makefile.in
index 0c44e3cf9..3bb26773e 100644
--- a/nsprpub/lib/libc/src/Makefile.in
+++ b/nsprpub/lib/libc/src/Makefile.in
@@ -40,8 +40,11 @@ LIBRARY_VERSION = $(MOD_MAJOR_VERSION)
 RELEASE_LIBS = $(TARGETS)
 
 ifeq ($(TENFOURFOX_VMX),1)
-CSRCS += plvmx.c
-CFLAGS += -faltivec
+SSRCS += \
+	vmx_haschr.S \
+	vmx_memchr.S \
+	vmx_strchr.S \
+	$(NULL)
 endif
 
 ifeq ($(OS_ARCH),WINNT)
diff --git a/nsprpub/lib/libc/src/vmx_haschr.S b/nsprpub/lib/libc/src/vmx_haschr.S
new file mode 100644
index 000000000..04f58a2b2
--- /dev/null
+++ b/nsprpub/lib/libc/src/vmx_haschr.S
@@ -0,0 +1,229 @@
+
+; VMX version of haschr in assembly
+;
+; Does not make stack frames or update the stack pointer.
+;
+; It uses Darwin's red zone to load/store vector values,
+; and part of the code assumes memory loads are BE ordered.
+;
+; r3: const void *b (input param)
+; r4: int c (input param)
+; r5: size_t length (input param)
+;
+; All GPRs used are volatile.
+
+
+#define VRSAVE 256
+#define VMX_ALL_NE 26
+#define VMX_ALL_EQ 24
+
+#ifdef _PPC970_
+
+#define PADDING nop
+
+#warning using 64-bit code
+
+        .machine ppc970
+
+#else
+
+#define PADDING
+
+        .machine ppc7400
+
+#endif
+
+        .text
+
+        .globl _vmx_haschr
+
+        .align 4
+
+_vmx_haschr:
+
+        mfspr r12,VRSAVE        ;Get old VRSAVE
+
+        cmplwi cr0,r5,16
+
+        neg r11,r3
+        add r7,r3,r5
+
+        ;prefix length in r6, suffix length in r7
+        clrlwi r6,r11,28
+        clrlwi r7,r7,28         ;logical AND 0xF
+
+        ;total length - prefix length
+        sub r8,r5,r6
+
+        ;the prefix and byte-loop counters predecrement, hence the +1
+        addi r6,r6,1
+        addi r9,r5,1
+
+        ;bytes into quadwords
+        srwi r8,r8,4            ;suffix is < 16 bytes, no need to subtract it
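+
+        ;A worked example of the setup above (illustrative values, not
+        ;from the original): for b = 0x1009 and length = 40,
+        ;  prefix    = (-0x1009) & 0xF     = 7  (bytes up to the next 16-byte boundary)
+        ;  suffix    = (0x1009 + 40) & 0xF = 1  (trailing bytes)
+        ;  quadwords = (40 - 7) >> 4       = 2  (aligned 16-byte blocks)
+        ;7 + 2*16 + 1 = 40, so the prefix, vector and suffix loops
+        ;together cover every byte exactly once.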
+
+        ;if total length is < 16, skip vmx
+        blt cr0,L_novmx
+
+        cmplwi cr1,r8,0
+
+        ;new VRSAVE
+        oris r0,r12,0xE000      ;VR0-VR2 = 0xE0000000
+
+        li r11,-16
+        stb r4,-16(r1)          ;store searchByte in red zone
+
+        mtctr r6
+
+        beq cr1,L_novmx
+
+        ;VMX is used
+        mtspr VRSAVE,r0
+
+        ;load, splat searchByte in VR0
+        lvebx v0,r1,r11
+        vspltb v0,v0,0
+
+        ;truncate int to uchar
+        clrlwi r4,r4,24
+
+L_prefix_loop:                  ;prefix loop
+
+        ;check for a prefix before actually looping
+        bdz L_prefix_end
+
+        lbz r9,0(r3)
+        addi r3,r3,1
+        cmplw cr1,r4,r9
+
+        bne cr1,L_prefix_loop
+
+        mtspr VRSAVE,r12        ;found the byte. c'ya ;-)
+        li r3,1
+
+        blr
+
+
+L_prefix_end:
+
+        ;check if there is a suffix
+        cmplwi cr0,r7,0
+
+        mtctr r8
+
+        ;first VMX iteration is outside the loop
+        lvx v1,0,r3
+        li r10,16
+        vcmpequb. v2,v1,v0
+
+        bdz L_vmx_end
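+
+        ;Note on the CR6 tests in the unrolled loop below (standard
+        ;AltiVec record-form semantics, summarized here for the reader):
+        ;vcmpequb. sets CR bit 24 (VMX_ALL_EQ) when all 16 bytes match
+        ;and CR bit 26 (VMX_ALL_NE) when none do, so "bf VMX_ALL_NE,..."
+        ;branches as soon as at least one byte equals the search byte.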
+
+        .align 4
+
+L_vmx_loop:                     ;vector loop
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_found
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdz L_vmx_end
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_found
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdz L_vmx_end
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_found
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdz L_vmx_end
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_found
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdz L_vmx_end
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_found
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdz L_vmx_end
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_found
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdnz L_vmx_loop
+
+L_vmx_end:
+
+        add r3,r3,r10
+        bf VMX_ALL_NE,L_found
+
+        mtctr r7
+
+        ;skip suffix if nonexistent
+        beq cr0,L_notfound
+
+        .align 4
+
+L_suffix_loop:                  ;suffix loop
+
+        lbz r5,0(r3)
+        addi r3,r3,1
+        cmplw cr0,r4,r5
+        beq cr0,L_found
+        bdnz L_suffix_loop
+
+L_notfound:
+
+        mtspr VRSAVE,r12
+        li r3,0
+        blr
+
+L_found:
+
+        mtspr VRSAVE,r12
+        li r3,1
+        blr
+
+
+        ;Path that skips VMX
+
+L_novmx:
+
+        mtctr r9
+        clrlwi r4,r4,24
+
+L_novmx_loop:
+        bdz L_notfound_1
+        lbz r5,0(r3)
+        addi r3,r3,1
+        cmpw cr0,r4,r5
+        bne cr0,L_novmx_loop
+
+L_found_1:
+        li r3,1                 ;keep the boolean contract of the other paths
+        blr
+L_notfound_1:
+        li r3,0
+        blr
+
+        .subsections_via_symbols
+
diff --git a/nsprpub/lib/libc/src/vmx_memchr.S b/nsprpub/lib/libc/src/vmx_memchr.S
new file mode 100644
index 000000000..39082a043
--- /dev/null
+++ b/nsprpub/lib/libc/src/vmx_memchr.S
@@ -0,0 +1,311 @@
+
+; VMX version of memchr in assembly
+;
+; Does not make stack frames or update the stack pointer.
+;
+; It uses Darwin's red zone to load/store vector values,
+; and part of the code assumes memory loads are BE ordered.
+;
+; r3: const void *b (input param)
+; r4: int c (input param)
+; r5: size_t length (input param)
+;
+; All GPRs used are volatile.
+
+
+#define VRSAVE 256
+#define VMX_ALL_NE 26
+#define VMX_ALL_EQ 24
+
+#ifdef _PPC970_
+
+#define PADDING nop
+
+#warning using 64-bit code
+
+        .machine ppc970
+
+#else
+
+#define PADDING
+
+        .machine ppc7400
+
+#endif
+
+        .text
+
+        .globl _vmx_memchr
+
+        .align 4
+
+_vmx_memchr:
+
+        mfspr r12,VRSAVE        ;Get old VRSAVE
+
+        cmplwi cr0,r5,16
+
+        neg r11,r3
+        add r7,r3,r5
+
+        ;prefix length in r6, suffix length in r7
+        clrlwi r6,r11,28
+        clrlwi r7,r7,28         ;logical AND 0xF
+
+        ;total length - prefix length
+        sub r8,r5,r6
+
+        ;the prefix and byte-loop counters predecrement, hence the +1
+        addi r6,r6,1
+        addi r9,r5,1
+
+        ;bytes into quadwords
+        srwi r8,r8,4            ;suffix is < 16 bytes, no need to subtract it
+
+        ;if total length is < 16, skip vmx
+        blt cr0,L_novmx
+
+        cmplwi cr1,r8,0
+
+        ;new VRSAVE
+        oris r0,r12,0xE000      ;VR0-VR2 = 0xE0000000
+
+        li r11,-16
+        stb r4,-16(r1)          ;store searchByte in red zone
+
+        mtctr r6
+
+        beq cr1,L_novmx
+
+        ;VMX is used
+        mtspr VRSAVE,r0
+
+        ;load, splat searchByte in VR0
+        lvebx v0,r1,r11
+        vspltb v0,v0,0
+
+        ;truncate int to uchar
+        clrlwi r4,r4,24
+
+L_prefix_loop:                  ;prefix loop
+
+        ;check for a prefix before actually looping
+        bdz L_prefix_end
+
+        lbz r9,0(r3)
+        addi r3,r3,1
+        cmplw cr1,r4,r9
+
+        bne cr1,L_prefix_loop
+
+        mtspr VRSAVE,r12        ;found the byte. c'ya ;-)
+        la r3,-1(r3)
+
+        blr
+
+
+L_prefix_end:
+
+        ;check if there is a suffix
+        cmplwi cr0,r7,0
+
+        mtctr r8
+
+        ;first VMX iteration is outside the loop
+        lvx v1,0,r3
+        li r10,16
+        vcmpequb. v2,v1,v0
+
+        bdz L_vmx_end
+
+        .align 4
+
+L_vmx_loop:                     ;vector loop
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_foundvmx
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdz L_vmx_end
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_foundvmx
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdz L_vmx_end
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_foundvmx
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdz L_vmx_end
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_foundvmx
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdz L_vmx_end
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_foundvmx
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdz L_vmx_end
+
+        lvx v1,r3,r10
+        la r10,16(r10)
+        PADDING
+        bf VMX_ALL_NE,L_foundvmx
+        PADDING
+        vcmpequb. v2,v1,v0
+        bdnz L_vmx_loop
+
+L_vmx_end:
+
+        add r3,r3,r10
+        bf VMX_ALL_NE,L_foundvmx_1
+
+        mtctr r7
+
+        ;skip suffix if nonexistent
+        beq cr0,L_notfound
+
+        .align 4
+
+L_suffix_loop:                  ;suffix loop
+
+        lbz r5,0(r3)
+        addi r3,r3,1
+        cmplw cr0,r4,r5
+        beq cr0,L_found
+        bdnz L_suffix_loop
+
+L_notfound:
+
+        mtspr VRSAVE,r12
+        li r3,0
+        blr
+
+L_found:
+
+        mtspr VRSAVE,r12
+        la r3,-1(r3)
+        blr
+
+        .align 4
+
+L_foundvmx:
+
+        ;r10 was already bumped past the match: back up 16, add the offset
+        subi r3,r3,16
+        add r3,r3,r10
+
+L_foundvmx_1:
+
+        stvx v2,r1,r11          ;store result vector in the red zone
+
+        mtspr VRSAVE,r12        ;restore VRSAVE
+
+        ;make r3 point to the "good" quadword
+        subi r3,r3,16
+
+
+;From here, the result vector from the last executed vcmpequb is stored
+;at -16(r1).
+;
+;The following part searches for the 'good' byte using GPRs (32- or 64-bit
+;according to the CPU). It loads the vector containing the 'good' byte into
+;GPRs, finds the first nonzero GPR, counts its leading 0's and divides that
+;by 8. This gives the 'good' byte's position inside the GPR from MS byte to
+;LS byte (0 MS, 3 LS on 32-bit).
+;This cuts the number of checks to do from 16 to 4 (32-bit GPRs) or
+;2 (64-bit GPRs).
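+;
+;Illustrative example (values invented for this note): a match at byte 5
+;leaves 0xFF in byte 5 of the stored vector. On PPC32 the first word
+;(bytes 0-3) is zero, so the search moves on to the second word,
+;0x00FF0000, where cntlzw returns 8 and 8 >> 3 = 1; added to the 4 bytes
+;already stepped over, that selects byte 5 of the quadword.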
+
+#ifdef _PPC970_
+
+        ld r5,-16(r1)
+        ld r6,-8(r1)
+
+        cmpldi cr0,r5,0
+        cntlzd r4,r5
+
+        bne cr0,L_bytefound     ;0-7
+
+        addi r3,r3,8            ;8-15
+        cntlzd r4,r6
+
+#else
+
+        ;PPC32
+
+        lwz r5,-16(r1)
+        lwz r6,-12(r1)
+
+        cmplwi cr0,r5,0
+        cntlzw r4,r5
+
+        cmplwi cr1,r6,0
+
+        lwz r7,-8(r1)
+
+        bne cr0,L_bytefound     ;0-3
+
+        addi r3,r3,4
+        cntlzw r4,r6
+
+        bne cr1,L_bytefound     ;4-7
+
+        cmplwi cr0,r7,0
+
+        addi r3,r3,4
+        cntlzw r4,r7
+        lwz r8,-4(r1)
+
+        bne cr0,L_bytefound     ;8-11
+
+        cntlzw r4,r8            ;12-15
+        addi r3,r3,4
+
+#endif
+
+L_bytefound:
+
+        PADDING
+        srwi r6,r4,3
+        PADDING
+        add r3,r6,r3
+        blr
+
+
+        ;Path that skips VMX
+
+L_novmx:
+
+        mtctr r9
+        clrlwi r4,r4,24
+
+L_novmx_loop:
+        bdz L_notfound_1
+        lbz r5,0(r3)
+        addi r3,r3,1
+        cmpw cr0,r4,r5
+        bne cr0,L_novmx_loop
+
+L_found_1:
+        la r3,-1(r3)
+        blr
+L_notfound_1:
+        li r3,0
+        blr
+
+        .subsections_via_symbols
+
diff --git a/nsprpub/lib/libc/src/vmx_strchr.S b/nsprpub/lib/libc/src/vmx_strchr.S
new file mode 100644
index 000000000..262bc9e83
--- /dev/null
+++ b/nsprpub/lib/libc/src/vmx_strchr.S
@@ -0,0 +1,306 @@
+
+; VMX version of strchr in assembly
+;
+; Does not make stack frames or update the stack pointer.
+;
+; It uses Darwin's red zone to load/store vector values,
+; and part of the code assumes memory loads are BE ordered.
+;
+; r3: const void *p (input param)
+; r4: int ch (input param)
+;
+; All GPRs used are volatile.
+
+#define VRSAVE 256
+#define VMX_ALL_NE 26
+#define VMX_ALL_EQ 24
+
+#ifdef _PPC970_
+
+#warning using 64-bit code
+
+        .machine ppc970
+
+#else
+
+        .machine ppc7400
+
+#endif
+
+        .text
+
+        .align 4
+
+        .globl _vmx_strchr
+
+_vmx_strchr:
+
+        ;truncate value to 8 bits, update CR0
+        clrlwi. r4,r4,24
+
+        neg r6,r3
+
+        mfspr r12,VRSAVE        ;get old VRSAVE
+
+        li r11,-16
+        stb r4,-16(r1)
+
+        ;prefix length in r6
+        clrlwi r6,r6,28         ;logical AND 0xF
+
+        oris r9,r12,0xFE00      ;VR0-VR6 = 0xFE000000
+
+        mtspr VRSAVE,r9
+
+        vspltisb v0,0
+        vspltisb v6,7
+
+        addi r6,r6,1
+        mtctr r6
+
+        ;If we search for NULL, use specific code
+        beq cr0,L_NULL_ONLY
+
+        lvebx v3,r1,r11
+        vspltb v3,v3,0
+
+        ;check if there is a prefix first
+        bdz L_prefix_end
+
+        lbz r5,0(r3)
+
+        .align 4
+
+L_prefix_loop:
+
+        addi r3,r3,1
+
+        cmpw cr0,r4,r5
+        cmpwi cr1,r5,0
+        beq cr0,L_found
+        beq cr1,L_notfound_1
+        lbz r5,0(r3)
+
+        bdnz L_prefix_loop
+
+;The loaded quadword is compared against both the end-of-string (zero)
+;vector and the splatted searchByte vector. The two result vectors are
+;then compared with each other. If there's no end-of-string and no
+;searchByte in the quadword, both result vectors are filled with 0's
+;and compare equal.
+;
+;If anything is found, there will be an inequality between the two.
+;
+;(Not applicable if the searchByte in question is NULL)
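+;
+;Illustrative example (memory contents invented for this note): scanning
+;"ab\0" for 'z', the zero compare leaves 0xFF at byte 2 of v4 while the
+;'z' compare leaves v2 all 0's, so v5 = (v2 == v4) is not all-true and
+;the loop below is exited; the NUL is then told apart from a real match
+;by the odd/even trick described after L_found_NULL.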
+
+L_prefix_end:
+
+        lvx v1,0,r3
+        vcmpequb v4,v1,v0
+        vcmpequb v2,v1,v3
+        li r10,16
+        vcmpequb. v5,v2,v4
+        bf VMX_ALL_EQ,L_foundsomething
+
+        .align 4
+
+L_vmx_loop:
+
+        lvx v1,r3,r10
+        vcmpequb v4,v1,v0
+        vcmpequb v2,v1,v3
+        la r10,16(r10)
+        vcmpequb. v5,v2,v4
+        bf VMX_ALL_EQ,L_foundsomething
+
+        lvx v1,r3,r10
+        vcmpequb v4,v1,v0
+        vcmpequb v2,v1,v3
+        la r10,16(r10)
+        vcmpequb. v5,v2,v4
+        bf VMX_ALL_EQ,L_foundsomething
+
+        lvx v1,r3,r10
+        vcmpequb v4,v1,v0
+        vcmpequb v2,v1,v3
+        la r10,16(r10)
+        vcmpequb. v5,v2,v4
+        bf VMX_ALL_EQ,L_foundsomething
+
+        lvx v1,r3,r10
+        vcmpequb v4,v1,v0
+        vcmpequb v2,v1,v3
+        la r10,16(r10)
+        vcmpequb. v5,v2,v4
+        bf VMX_ALL_EQ,L_foundsomething
+
+        lvx v1,r3,r10
+        vcmpequb v4,v1,v0
+        vcmpequb v2,v1,v3
+        la r10,16(r10)
+        vcmpequb. v5,v2,v4
+        bt VMX_ALL_EQ,L_vmx_loop
+
+L_foundsomething:
+
+        ;end-of-string bytes will have a 0x07 pattern
+        ;in case of equality instead of 0xFF.
+        vand v4,v4,v6
+
+        ;OR the two result vectors together.
+        vor v2,v2,v4
+
+L_found_NULL:
+
+        stvx v2,r1,r11
+        add r3,r3,r10
+        subi r3,r3,16
+        mtspr VRSAVE,r12
+
+;Both result vectors OR'd together are stored at -16(r1).
+;Load the vector into GPRs and stop at the first nonzero GPR.
+;Count the leading 0's; if the count is odd it's end-of-string.
+;Otherwise it's the "good" byte.
+;
+;If we searched for NULL then it'll always be the good byte.
+
+#ifdef _PPC970_
+
+        ld r5,-16(r1)
+        ld r6,-8(r1)
+
+        cmpldi cr0,r5,0
+        cntlzd r4,r5
+
+        bne cr0,L_bytefound     ;0-7
+
+        addi r3,r3,8            ;8-15
+        cntlzd r4,r6
+
+#else
+
+        ;PPC32
+
+        lwz r5,-16(r1)
+        lwz r6,-12(r1)
+
+        cmplwi cr0,r5,0
+        cntlzw r4,r5
+
+        cmplwi cr1,r6,0
+
+        lwz r7,-8(r1)
+
+        bne cr0,L_bytefound     ;0-3
+
+        addi r3,r3,4
+        cntlzw r4,r6
+
+        bne cr1,L_bytefound     ;4-7
+
+        cmplwi cr0,r7,0
+
+        addi r3,r3,4
+        cntlzw r4,r7
+        lwz r8,-4(r1)
+
+        bne cr0,L_bytefound     ;8-11
+
+        cntlzw r4,r8            ;12-15
+        addi r3,r3,4
+
+#endif
+
+L_bytefound:
+
+        clrlwi r7,r4,31         ;look if it's an odd number
+        srwi r6,r4,3
+        cmplwi cr0,r7,0
+        nop
+
+        bne cr0,L_notfound
+
+        add r3,r6,r3            ;update r3 to point to the good byte
+        blr
+
+        .align 4
+
+
+;Code to execute when searching for NULL
+
+L_NULL_ONLY:
+
+        bdz L_NULLVMX
+        lbz r5,0(r3)
+        addi r3,r3,1
+        cmplwi cr0,r5,0
+
+        bne cr0,L_NULL_ONLY     ;prefix loop
+
+        la r3,-1(r3)
+        mtspr VRSAVE,r12
+
+        blr
+
+
+L_NULLVMX:
+
+        lvx v1,0,r3
+        vcmpequb. v2,v1,v0
+        li r10,16
+        bc 4,26,L_found_NULL
+
+L_NULL_LOOP:
+
+        lvx v1,r3,r10
+        vcmpequb. v2,v1,v0
+        la r10,16(r10)
+        bc 4,26,L_found_NULL
+
+        lvx v1,r3,r10
+        vcmpequb. v2,v1,v0
+        la r10,16(r10)
+        bc 4,26,L_found_NULL
+
+        lvx v1,r3,r10
+        vcmpequb. v2,v1,v0
+        la r10,16(r10)
+        bc 4,26,L_found_NULL
+
+        lvx v1,r3,r10
+        vcmpequb. v2,v1,v0
+        la r10,16(r10)
+        bc 4,26,L_found_NULL
+
+        lvx v1,r3,r10
+        vcmpequb. v2,v1,v0
+        la r10,16(r10)
+        bc 4,26,L_found_NULL
+
+        lvx v1,r3,r10
+        vcmpequb. v2,v1,v0
+        la r10,16(r10)
+        bc+ 12,26,L_NULL_LOOP
+
+        b L_found_NULL
+
+L_notfound_1:
+
+        mtspr VRSAVE,r12
+
+L_notfound:
+
+        li r3,0
+        blr
+
+L_found:
+
+        mtspr VRSAVE,r12
+        la r3,-1(r3)
+        blr
+
+        .subsections_via_symbols
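
For reference, a minimal C sketch of the semantics the three routines above
are expected to implement (function names and the nonzero-return convention
of vmx_haschr are inferred from the assembly comments, not stated elsewhere
in the patch):

#include <stddef.h>

/* nonzero if byte c occurs in the first length bytes at b */
int ref_haschr(const void *b, int c, size_t length)
{
    const unsigned char *p = (const unsigned char *)b;
    unsigned char uc = (unsigned char)c;    /* same truncation as clrlwi r4,r4,24 */
    size_t i;
    for (i = 0; i < length; i++)
        if (p[i] == uc)
            return 1;
    return 0;
}

/* first occurrence of c in b, or NULL (standard memchr) */
void *ref_memchr(const void *b, int c, size_t length)
{
    const unsigned char *p = (const unsigned char *)b;
    unsigned char uc = (unsigned char)c;
    size_t i;
    for (i = 0; i < length; i++)
        if (p[i] == uc)
            return (void *)(p + i);
    return NULL;
}

/* first occurrence of ch in p, or NULL; searching for NUL
   returns a pointer to the terminator (standard strchr) */
char *ref_strchr(const char *p, int ch)
{
    unsigned char uc = (unsigned char)ch;
    for (;; p++) {
        if ((unsigned char)*p == uc)
            return (char *)p;
        if (*p == '\0')
            return NULL;
    }
}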