From d7da827be8dc5d02f1b1206b7c61b82f6e187b5c Mon Sep 17 00:00:00 2001 From: IrgendwerA8 Date: Mon, 21 May 2018 18:18:01 +0200 Subject: [PATCH 1/4] Apply faster popptr1 to functions and/or use register instead of stack to save accu. --- libsrc/runtime/mul.s | 24 ++++++++++++------------ libsrc/runtime/mul8.s | 20 ++++++++++---------- libsrc/runtime/mulax3.s | 4 ++-- libsrc/runtime/mulax5.s | 4 ++-- libsrc/runtime/mulax7.s | 4 ++-- libsrc/runtime/mulax9.s | 4 ++-- 6 files changed, 30 insertions(+), 30 deletions(-) diff --git a/libsrc/runtime/mul.s b/libsrc/runtime/mul.s index 6344e3a32..a9b53293b 100644 --- a/libsrc/runtime/mul.s +++ b/libsrc/runtime/mul.s @@ -6,8 +6,8 @@ .export tosumulax, tosmulax .import mul8x16, mul8x16a ; in mul8.s - .import popsreg - .importzp sreg, tmp1, ptr4 + .import popptr1 + .importzp tmp1, ptr1, ptr4 ;--------------------------------------------------------------------------- @@ -19,12 +19,12 @@ tosumulax: txa ; High byte zero beq @L3 ; Do 8x16 multiplication if high byte zero stx ptr4+1 ; Save right operand - jsr popsreg ; Get left operand + jsr popptr1 ; Get left operand (Y=0 by popptr1) -; Do ptr4:ptr4+1 * sreg:sreg+1 --> AX +; Do ptr4:ptr4+1 * ptr1:ptr1+1 --> AX - lda #0 - ldx sreg+1 ; Get high byte into register for speed + tya ; A = 0 + ldx ptr1+1 ; check if lhs is 8 bit only beq @L4 ; -> we can do 8x16 after swap sta tmp1 ldy #16 ; Number of bits @@ -34,12 +34,12 @@ tosumulax: @L0: bcc @L1 clc - adc sreg - pha - txa ; hi byte of left op + adc ptr1 + tax + lda ptr1+1 ; hi byte of left op adc tmp1 sta tmp1 - pla + txa @L1: ror tmp1 ror a @@ -59,9 +59,9 @@ tosumulax: ; If the high byte of rhs is zero, swap the operands and use the 8x16 ; routine. 
On entry, A and X are zero -@L4: ldy sreg ; Save right operand (8 bit) +@L4: ldy ptr1 ; Save right operand (8 bit) ldx ptr4 ; Copy left 16 bit operand to right - stx sreg + stx ptr1 ldx ptr4+1 ; Don't store, this is done later sty ptr4 ; Copy low 8 bit of right op to left ldy #8 diff --git a/libsrc/runtime/mul8.s b/libsrc/runtime/mul8.s index 9d4dfcbf4..395d64a4c 100644 --- a/libsrc/runtime/mul8.s +++ b/libsrc/runtime/mul8.s @@ -6,8 +6,8 @@ .export tosumula0, tosmula0 .export mul8x16, mul8x16a - .import popsreg - .importzp sreg, ptr4 + .import popptr1 + .importzp ptr1, ptr4 ;--------------------------------------------------------------------------- @@ -16,11 +16,11 @@ tosmula0: tosumula0: sta ptr4 -mul8x16:jsr popsreg ; Get left operand +mul8x16:jsr popptr1 ; Get left operand (Y=0 by popptr1) - lda #0 ; Clear byte 1 + tya ; Clear byte 1 ldy #8 ; Number of bits - ldx sreg+1 ; Get into register for speed + ldx ptr1+1 ; check if lhs is 8 bit only beq mul8x8 ; Do 8x8 multiplication if high byte zero mul8x16a: sta ptr4+1 ; Clear byte 2 @@ -29,12 +29,12 @@ mul8x16a: @L0: bcc @L1 clc - adc sreg - pha - txa ; hi byte of left op + adc ptr1 + tax + lda ptr1+1 ; hi byte of left op adc ptr4+1 sta ptr4+1 - pla + txa @L1: ror ptr4+1 ror a @@ -52,7 +52,7 @@ mul8x8: lsr ptr4 ; Get first bit into carry @L0: bcc @L1 clc - adc sreg + adc ptr1 @L1: ror ror ptr4 dey diff --git a/libsrc/runtime/mulax3.s b/libsrc/runtime/mulax3.s index 472bc60ec..82cc033c3 100644 --- a/libsrc/runtime/mulax3.s +++ b/libsrc/runtime/mulax3.s @@ -15,11 +15,11 @@ rol ptr1+1 clc adc ptr1 - pha + tay txa adc ptr1+1 tax - pla + tya rts .endproc diff --git a/libsrc/runtime/mulax5.s b/libsrc/runtime/mulax5.s index 7e5ed11d9..bf5eaefe8 100644 --- a/libsrc/runtime/mulax5.s +++ b/libsrc/runtime/mulax5.s @@ -17,11 +17,11 @@ rol ptr1+1 clc adc ptr1 - pha + tay txa adc ptr1+1 tax - pla + tya rts .endproc diff --git a/libsrc/runtime/mulax7.s b/libsrc/runtime/mulax7.s index 90313180c..3414ebc9e 100644 --- 
a/libsrc/runtime/mulax7.s +++ b/libsrc/runtime/mulax7.s @@ -20,12 +20,12 @@ rol ptr1+1 ; * 8 sec sbc ptr1 - pha + tay txa eor #$ff adc ptr1+1 ; * (8 - 1) tax - pla + tya rts .endproc diff --git a/libsrc/runtime/mulax9.s b/libsrc/runtime/mulax9.s index d2dd89529..d175d55aa 100644 --- a/libsrc/runtime/mulax9.s +++ b/libsrc/runtime/mulax9.s @@ -20,11 +20,11 @@ rol ptr1+1 ; * 8 clc adc ptr1 ; * (8+1) - pha + tay txa adc ptr1+1 tax - pla + tya rts .endproc From ba2c6d9008d931cd2fcf83d4e01fa3f2c2972f7b Mon Sep 17 00:00:00 2001 From: IrgendwerA8 Date: Tue, 22 May 2018 15:59:05 +0200 Subject: [PATCH 2/4] Further optimizations in common/conio. --- libsrc/common/ltoa.s | 92 +++++++++++++++++++---------------------- libsrc/common/strcspn.s | 19 ++++----- libsrc/common/strlen.s | 12 +++--- libsrc/common/strspn.s | 19 ++++----- libsrc/conio/scrsize.s | 27 ++++++------ 5 files changed, 81 insertions(+), 88 deletions(-) diff --git a/libsrc/common/ltoa.s b/libsrc/common/ltoa.s index 5dc215bd1..54b693ecc 100644 --- a/libsrc/common/ltoa.s +++ b/libsrc/common/ltoa.s @@ -6,11 +6,11 @@ ; .export _ltoa, _ultoa - .import popax + .import popax, popptr1, negeax .import __hextab, __longminstr .importzp sreg, ptr1, ptr2, ptr3, tmp1 - + .macpack cpu .code @@ -19,17 +19,15 @@ ; dopop: sta tmp1 ; will loose high byte - jsr popax ; get s - sta ptr1 - stx ptr1+1 - sta sreg ; save for return - stx sreg+1 - jsr popax ; get low word of value + jsr popax ; get s to ptr2 sta ptr2 stx ptr2+1 - jsr popax ; get high word of value - sta ptr3 + sta ptr3 ; save for return stx ptr3+1 + jsr popptr1 ; get low word of value to ptr1 + jsr popax ; get high word of value to sreg + sta sreg + stx sreg+1 rts ; @@ -41,20 +39,20 @@ _ltoa: jsr dopop ; pop the arguments ; We must handle $80000000 in a special way, since it is the only negative ; number that has no positive 32-bit counterpart - ldx ptr3+1 ; get high byte + ldx sreg+1 ; get high byte ldy tmp1 ; get radix cpy #10 bne ultoa - lda ptr3 - ora ptr2+1 - ora 
ptr2 + lda sreg + ora ptr1+1 + ora ptr1 bne L2 cpx #$80 bne L2 ldy #11 L1: lda __longminstr,y ; copy -2147483648 - sta (ptr1),y + sta (ptr2),y dey bpl L1 jmp L10 @@ -65,29 +63,25 @@ L1: lda __longminstr,y ; copy -2147483648 L2: txa ; get high byte bpl ultoa lda #'-' - ldy #0 - sta (ptr1),y ; store sign - inc ptr1 - bne L3 - inc ptr1+1 -L3: lda ptr2 ; negate val - eor #$FF - clc - adc #$01 - sta ptr2 - lda ptr2+1 - eor #$FF - adc #$00 - sta ptr2+1 - lda ptr3 - eor #$FF - adc #$00 - sta ptr3 - lda ptr3+1 - eor #$FF - adc #$00 - sta ptr3+1 +.if (.cpu .bitand CPU_ISET_65SC02) + sta (ptr2) +.else + ldy #0 + sta (ptr2),y ; store sign +.endif + + inc ptr2 + bne L3 + inc ptr2+1 + +L3: lda ptr1 ; negate val + ldx ptr1+1 + + jsr negeax + + sta ptr1 + stx ptr1+1 jmp ultoa ; @@ -105,15 +99,15 @@ ultoa: lda #$00 L5: ldy #32 ; 32 bit lda #0 ; remainder -L6: asl ptr2 - rol ptr2+1 - rol ptr3 - rol ptr3+1 +L6: asl ptr1 + rol ptr1+1 + rol sreg + rol sreg+1 rol a cmp tmp1 bcc L7 sbc tmp1 - inc ptr2 + inc ptr1 L7: dey bne L6 @@ -121,25 +115,25 @@ L7: dey lda __hextab,y ; get hex character pha ; save char value on stack - lda ptr2 - ora ptr2+1 - ora ptr3 - ora ptr3+1 + lda ptr1 + ora ptr1+1 + ora sreg + ora sreg+1 bne L5 ; Get the characters from the stack into the string ldy #0 L9: pla - sta (ptr1),y + sta (ptr2),y beq L10 ; jump if sentinel iny bne L9 ; jump always ; Done! Return the target string -L10: lda sreg - ldx sreg+1 +L10: lda ptr3 + ldx ptr3+1 rts diff --git a/libsrc/common/strcspn.s b/libsrc/common/strcspn.s index c9122dc90..9cf159218 100644 --- a/libsrc/common/strcspn.s +++ b/libsrc/common/strcspn.s @@ -6,40 +6,39 @@ ; .export _strcspn - .import popax, _strlen + .import popptr1, _strlen .importzp ptr1, ptr2, tmp1, tmp2 _strcspn: - jsr _strlen ; get length in a/x and transfer s2 to ptr1 + jsr _strlen ; get length in a/x and transfer s2 to ptr2 ; Note: It does not make sense to ; have more than 255 test chars, so - ; we don't support a high byte here! 
(ptr1+1 is + ; we don't support a high byte here! (ptr2+1 is ; also unchanged in strlen then (important!)) ; -> the original implementation also ; ignored this case sta tmp1 ; tmp1 = strlen of test chars - jsr popax ; get and save s1 - sta ptr2 ; to ptr2 - stx ptr2+1 + jsr popptr1 ; get and save s1 to ptr1 + ldx #0 ; low counter byte stx tmp2 ; high counter byte loadChar: ldy #0 - lda (ptr2),y ; get next char from s1 + lda (ptr1),y ; get next char from s1 beq leave ; handly byte of s1 advance: - inc ptr2 ; advance string position to test + inc ptr1 ; advance string position to test bne check - inc ptr2+1 + inc ptr1+1 dey ; correct next iny (faster/shorter than bne...) checkNext: iny check: cpy tmp1 ; compare with length of test character string beq endOfTestChars - cmp (ptr1),y ; found matching char? + cmp (ptr2),y ; found matching char? bne checkNext leave: txa ; restore position of finding diff --git a/libsrc/common/strlen.s b/libsrc/common/strlen.s index 1a51edb11..e89039179 100644 --- a/libsrc/common/strlen.s +++ b/libsrc/common/strlen.s @@ -2,26 +2,26 @@ ; Ullrich von Bassewitz, 31.05.1998 ; ; Note: strspn & strcspn call internally this function and rely on -; the usage of only ptr1 here! Keep in mind when appling changes +; the usage of only ptr2 here! Keep in mind when appling changes ; and check the other implementations too! 
; ; int strlen (const char* s); ; .export _strlen - .importzp ptr1 + .importzp ptr2 _strlen: - sta ptr1 ; Save s - stx ptr1+1 + sta ptr2 ; Save s + stx ptr2+1 ldx #0 ; YX used as counter ldy #0 -L1: lda (ptr1),y +L1: lda (ptr2),y beq L9 iny bne L1 - inc ptr1+1 + inc ptr2+1 inx bne L1 diff --git a/libsrc/common/strspn.s b/libsrc/common/strspn.s index 079b935ee..6fda716be 100644 --- a/libsrc/common/strspn.s +++ b/libsrc/common/strspn.s @@ -6,40 +6,39 @@ ; .export _strspn - .import popax, _strlen + .import popptr1, _strlen .importzp ptr1, ptr2, tmp1, tmp2 _strspn: - jsr _strlen ; get length in a/x and transfer s2 to ptr1 + jsr _strlen ; get length in a/x and transfer s2 to ptr2 ; Note: It does not make sense to ; have more than 255 test chars, so - ; we don't support a high byte here! (ptr1+1 is + ; we don't support a high byte here! (ptr2+1 is ; also unchanged in strlen then (important!)) ; -> the original implementation also ; ignored this case sta tmp1 ; tmp1 = strlen of test chars - jsr popax ; get and save s1 - sta ptr2 ; to ptr2 - stx ptr2+1 + jsr popptr1 ; get and save s1 to ptr1 + ldx #0 ; low counter byte stx tmp2 ; high counter byte loadChar: ldy #0 - lda (ptr2),y ; get next char from s1 + lda (ptr1),y ; get next char from s1 beq leave ; handly byte of s1 advance: - inc ptr2 ; advance string position to test + inc ptr1 ; advance string position to test bne check - inc ptr2+1 + inc ptr1+1 dey ; correct next iny (faster/shorter than bne...) checkNext: iny check: cpy tmp1 ; compare with length of test character string beq leave - cmp (ptr1),y ; found matching char? + cmp (ptr2),y ; found matching char? 
bne checkNext foundTestChar: diff --git a/libsrc/conio/scrsize.s b/libsrc/conio/scrsize.s index 6582568d7..014b6f08b 100644 --- a/libsrc/conio/scrsize.s +++ b/libsrc/conio/scrsize.s @@ -6,29 +6,30 @@ .export _screensize - .import popsreg + .import popptr1 .import screensize - .importzp ptr1, sreg + .importzp ptr1, ptr2 + + .macpack cpu .proc _screensize - sta ptr1 ; Store the y pointer - stx ptr1+1 - jsr popsreg ; Get the x pointer into sreg + sta ptr2 ; Store the y pointer + stx ptr2+1 + jsr popptr1 ; Get the x pointer into ptr1 jsr screensize ; Get screensize into X/Y tya ; Get Y size into A -.IFP02 - ldy #0 - sta (ptr1),y +.if (.cpu .bitand ::CPU_ISET_65SC02) + sta (ptr2) txa - sta (sreg),y -.ELSE sta (ptr1) +.else + ldy #0 + sta (ptr2),y txa - sta (sreg) -.ENDIF - + sta (ptr1),y +.endif rts .endproc From 808d3ab4714c2bf393b0befddafe2a7ea7897289 Mon Sep 17 00:00:00 2001 From: IrgendwerA8 Date: Tue, 22 May 2018 18:35:05 +0200 Subject: [PATCH 3/4] Fix for 8x16 multiplication if operands are swapped. --- libsrc/runtime/mul.s | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libsrc/runtime/mul.s b/libsrc/runtime/mul.s index a9b53293b..769cf9d8e 100644 --- a/libsrc/runtime/mul.s +++ b/libsrc/runtime/mul.s @@ -62,7 +62,8 @@ tosumulax: @L4: ldy ptr1 ; Save right operand (8 bit) ldx ptr4 ; Copy left 16 bit operand to right stx ptr1 - ldx ptr4+1 ; Don't store, this is done later + ldx ptr4+1 ; swap high-byte too + stx ptr1+1 sty ptr4 ; Copy low 8 bit of right op to left ldy #8 jmp mul8x16a From 6175271651d769610d5aa3fbd46b6ad61ee56056 Mon Sep 17 00:00:00 2001 From: IrgendwerA8 Date: Tue, 22 May 2018 19:10:07 +0200 Subject: [PATCH 4/4] Removed optimizations which break the compiler ones.
--- libsrc/runtime/mulax3.s | 4 ++-- libsrc/runtime/mulax5.s | 5 +++-- libsrc/runtime/mulax7.s | 5 +++-- libsrc/runtime/mulax9.s | 5 +++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/libsrc/runtime/mulax3.s b/libsrc/runtime/mulax3.s index 82cc033c3..472bc60ec 100644 --- a/libsrc/runtime/mulax3.s +++ b/libsrc/runtime/mulax3.s @@ -15,11 +15,11 @@ rol ptr1+1 clc adc ptr1 - tay + pha txa adc ptr1+1 tax - tya + pla rts .endproc diff --git a/libsrc/runtime/mulax5.s b/libsrc/runtime/mulax5.s index bf5eaefe8..99f0ffce0 100644 --- a/libsrc/runtime/mulax5.s +++ b/libsrc/runtime/mulax5.s @@ -3,6 +3,7 @@ ; ; CC65 runtime: Multiply the primary register by 5 ; +; Don't touch the Y-register here, the optimizer relies on it! .export mulax5 .importzp ptr1 @@ -17,11 +18,11 @@ rol ptr1+1 clc adc ptr1 - tay + pha txa adc ptr1+1 tax - tya + pla rts .endproc diff --git a/libsrc/runtime/mulax7.s b/libsrc/runtime/mulax7.s index 3414ebc9e..6f2b2bf61 100644 --- a/libsrc/runtime/mulax7.s +++ b/libsrc/runtime/mulax7.s @@ -4,6 +4,7 @@ ; ; CC65 runtime: Multiply the primary register by 7 ; +; Don't touch the Y-register here, the optimizer relies on it! .export mulax7 .importzp ptr1 @@ -20,12 +21,12 @@ rol ptr1+1 ; * 8 sec sbc ptr1 - tay + pha txa eor #$ff adc ptr1+1 ; * (8 - 1) tax - tya + pla rts .endproc diff --git a/libsrc/runtime/mulax9.s b/libsrc/runtime/mulax9.s index d175d55aa..064eb458b 100644 --- a/libsrc/runtime/mulax9.s +++ b/libsrc/runtime/mulax9.s @@ -4,6 +4,7 @@ ; ; CC65 runtime: Multiply the primary register by 9 ; +; Don't touch the Y-register here, the optimizer relies on it! .export mulax9 .importzp ptr1 @@ -20,11 +21,11 @@ rol ptr1+1 ; * 8 clc adc ptr1 ; * (8+1) - tay + pha txa adc ptr1+1 tax - tya + pla rts .endproc