1
0
mirror of https://github.com/cc65/cc65.git synced 2024-06-08 15:29:37 +00:00

Merge pull request #657 from IrgendwerA8/VariousSpeedSizeOptimizations

Various speed size optimizations
This commit is contained in:
Oliver Schmidt 2018-05-23 09:46:30 +02:00 committed by GitHub
commit f485be1b84
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 108 additions and 111 deletions

View File

@ -6,11 +6,11 @@
;
.export _ltoa, _ultoa
.import popax
.import popax, popptr1, negeax
.import __hextab, __longminstr
.importzp sreg, ptr1, ptr2, ptr3, tmp1
.macpack cpu
.code
@ -19,17 +19,15 @@
;
dopop: sta tmp1 ; will lose high byte
jsr popax ; get s
sta ptr1
stx ptr1+1
sta sreg ; save for return
stx sreg+1
jsr popax ; get low word of value
jsr popax ; get s to ptr2
sta ptr2
stx ptr2+1
jsr popax ; get high word of value
sta ptr3
sta ptr3 ; save for return
stx ptr3+1
jsr popptr1 ; get low word of value to ptr1
jsr popax ; get high word of value to sreg
sta sreg
stx sreg+1
rts
;
@ -41,20 +39,20 @@ _ltoa: jsr dopop ; pop the arguments
; We must handle $80000000 in a special way, since it is the only negative
; number that has no positive 32-bit counterpart
ldx ptr3+1 ; get high byte
ldx sreg+1 ; get high byte
ldy tmp1 ; get radix
cpy #10
bne ultoa
lda ptr3
ora ptr2+1
ora ptr2
lda sreg
ora ptr1+1
ora ptr1
bne L2
cpx #$80
bne L2
ldy #11
L1: lda __longminstr,y ; copy -2147483648
sta (ptr1),y
sta (ptr2),y
dey
bpl L1
jmp L10
@ -65,29 +63,25 @@ L1: lda __longminstr,y ; copy -2147483648
L2: txa ; get high byte
bpl ultoa
lda #'-'
ldy #0
sta (ptr1),y ; store sign
inc ptr1
bne L3
inc ptr1+1
L3: lda ptr2 ; negate val
eor #$FF
clc
adc #$01
sta ptr2
lda ptr2+1
eor #$FF
adc #$00
sta ptr2+1
lda ptr3
eor #$FF
adc #$00
sta ptr3
lda ptr3+1
eor #$FF
adc #$00
sta ptr3+1
.if (.cpu .bitand CPU_ISET_65SC02)
sta (ptr2)
.else
ldy #0
sta (ptr2),y ; store sign
.endif
inc ptr2
bne L3
inc ptr2+1
L3: lda ptr1 ; negate val
ldx ptr1+1
jsr negeax
sta ptr1
stx ptr1+1
jmp ultoa
;
@ -105,15 +99,15 @@ ultoa: lda #$00
L5: ldy #32 ; 32 bit
lda #0 ; remainder
L6: asl ptr2
rol ptr2+1
rol ptr3
rol ptr3+1
L6: asl ptr1
rol ptr1+1
rol sreg
rol sreg+1
rol a
cmp tmp1
bcc L7
sbc tmp1
inc ptr2
inc ptr1
L7: dey
bne L6
@ -121,25 +115,25 @@ L7: dey
lda __hextab,y ; get hex character
pha ; save char value on stack
lda ptr2
ora ptr2+1
ora ptr3
ora ptr3+1
lda ptr1
ora ptr1+1
ora sreg
ora sreg+1
bne L5
; Get the characters from the stack into the string
ldy #0
L9: pla
sta (ptr1),y
sta (ptr2),y
beq L10 ; jump if sentinel
iny
bne L9 ; jump always
; Done! Return the target string
L10: lda sreg
ldx sreg+1
L10: lda ptr3
ldx ptr3+1
rts

View File

@ -6,40 +6,39 @@
;
.export _strcspn
.import popax, _strlen
.import popptr1, _strlen
.importzp ptr1, ptr2, tmp1, tmp2
_strcspn:
jsr _strlen ; get length in a/x and transfer s2 to ptr1
jsr _strlen ; get length in a/x and transfer s2 to ptr2
; Note: It does not make sense to
; have more than 255 test chars, so
; we don't support a high byte here! (ptr1+1 is
; we don't support a high byte here! (ptr2+1 is
; also unchanged in strlen then (important!))
; -> the original implementation also
; ignored this case
sta tmp1 ; tmp1 = strlen of test chars
jsr popax ; get and save s1
sta ptr2 ; to ptr2
stx ptr2+1
jsr popptr1 ; get and save s1 to ptr1
ldx #0 ; low counter byte
stx tmp2 ; high counter byte
loadChar:
ldy #0
lda (ptr2),y ; get next char from s1
lda (ptr1),y ; get next char from s1
beq leave ; handle zero byte (end) of s1
advance:
inc ptr2 ; advance string position to test
inc ptr1 ; advance string position to test
bne check
inc ptr2+1
inc ptr1+1
dey ; correct next iny (faster/shorter than bne...)
checkNext:
iny
check: cpy tmp1 ; compare with length of test character string
beq endOfTestChars
cmp (ptr1),y ; found matching char?
cmp (ptr2),y ; found matching char?
bne checkNext
leave: txa ; restore position of finding

View File

@ -2,26 +2,26 @@
; Ullrich von Bassewitz, 31.05.1998
;
; Note: strspn & strcspn call internally this function and rely on
; the usage of only ptr1 here! Keep in mind when applying changes
; the usage of only ptr2 here! Keep in mind when applying changes
; and check the other implementations too!
;
; int strlen (const char* s);
;
.export _strlen
.importzp ptr1
.importzp ptr2
_strlen:
sta ptr1 ; Save s
stx ptr1+1
sta ptr2 ; Save s
stx ptr2+1
ldx #0 ; YX used as counter
ldy #0
L1: lda (ptr1),y
L1: lda (ptr2),y
beq L9
iny
bne L1
inc ptr1+1
inc ptr2+1
inx
bne L1

View File

@ -6,40 +6,39 @@
;
.export _strspn
.import popax, _strlen
.import popptr1, _strlen
.importzp ptr1, ptr2, tmp1, tmp2
_strspn:
jsr _strlen ; get length in a/x and transfer s2 to ptr1
jsr _strlen ; get length in a/x and transfer s2 to ptr2
; Note: It does not make sense to
; have more than 255 test chars, so
; we don't support a high byte here! (ptr1+1 is
; we don't support a high byte here! (ptr2+1 is
; also unchanged in strlen then (important!))
; -> the original implementation also
; ignored this case
sta tmp1 ; tmp1 = strlen of test chars
jsr popax ; get and save s1
sta ptr2 ; to ptr2
stx ptr2+1
jsr popptr1 ; get and save s1 to ptr1
ldx #0 ; low counter byte
stx tmp2 ; high counter byte
loadChar:
ldy #0
lda (ptr2),y ; get next char from s1
lda (ptr1),y ; get next char from s1
beq leave ; handle zero byte (end) of s1
advance:
inc ptr2 ; advance string position to test
inc ptr1 ; advance string position to test
bne check
inc ptr2+1
inc ptr1+1
dey ; correct next iny (faster/shorter than bne...)
checkNext:
iny
check: cpy tmp1 ; compare with length of test character string
beq leave
cmp (ptr1),y ; found matching char?
cmp (ptr2),y ; found matching char?
bne checkNext
foundTestChar:

View File

@ -6,29 +6,30 @@
.export _screensize
.import popsreg
.import popptr1
.import screensize
.importzp ptr1, sreg
.importzp ptr1, ptr2
.macpack cpu
.proc _screensize
sta ptr1 ; Store the y pointer
stx ptr1+1
jsr popsreg ; Get the x pointer into sreg
sta ptr2 ; Store the y pointer
stx ptr2+1
jsr popptr1 ; Get the x pointer into ptr1
jsr screensize ; Get screensize into X/Y
tya ; Get Y size into A
.IFP02
ldy #0
sta (ptr1),y
.if (.cpu .bitand ::CPU_ISET_65SC02)
sta (ptr2)
txa
sta (sreg),y
.ELSE
sta (ptr1)
.else
ldy #0
sta (ptr2),y
txa
sta (sreg)
.ENDIF
sta (ptr1),y
.endif
rts
.endproc

View File

@ -6,8 +6,8 @@
.export tosumulax, tosmulax
.import mul8x16, mul8x16a ; in mul8.s
.import popsreg
.importzp sreg, tmp1, ptr4
.import popptr1
.importzp tmp1, ptr1, ptr4
;---------------------------------------------------------------------------
@ -19,12 +19,12 @@ tosumulax:
txa ; High byte zero
beq @L3 ; Do 8x16 multiplication if high byte zero
stx ptr4+1 ; Save right operand
jsr popsreg ; Get left operand
jsr popptr1 ; Get left operand (Y=0 by popptr1)
; Do ptr4:ptr4+1 * sreg:sreg+1 --> AX
; Do ptr4:ptr4+1 * ptr1:ptr1+1 --> AX
lda #0
ldx sreg+1 ; Get high byte into register for speed
tya ; A = 0
ldx ptr1+1 ; check if lhs is 8 bit only
beq @L4 ; -> we can do 8x16 after swap
sta tmp1
ldy #16 ; Number of bits
@ -34,12 +34,12 @@ tosumulax:
@L0: bcc @L1
clc
adc sreg
pha
txa ; hi byte of left op
adc ptr1
tax
lda ptr1+1 ; hi byte of left op
adc tmp1
sta tmp1
pla
txa
@L1: ror tmp1
ror a
@ -59,10 +59,11 @@ tosumulax:
; If the high byte of lhs is zero, swap the operands and use the 8x16
; routine. On entry, A and X are zero
@L4: ldy sreg ; Save right operand (8 bit)
@L4: ldy ptr1 ; Save right operand (8 bit)
ldx ptr4 ; Copy left 16 bit operand to right
stx sreg
ldx ptr4+1 ; Don't store, this is done later
stx ptr1
ldx ptr4+1 ; swap high-byte too
stx ptr1+1
sty ptr4 ; Copy low 8 bit of right op to left
ldy #8
jmp mul8x16a

View File

@ -6,8 +6,8 @@
.export tosumula0, tosmula0
.export mul8x16, mul8x16a
.import popsreg
.importzp sreg, ptr4
.import popptr1
.importzp ptr1, ptr4
;---------------------------------------------------------------------------
@ -16,11 +16,11 @@
tosmula0:
tosumula0:
sta ptr4
mul8x16:jsr popsreg ; Get left operand
mul8x16:jsr popptr1 ; Get left operand (Y=0 by popptr1)
lda #0 ; Clear byte 1
tya ; Clear byte 1
ldy #8 ; Number of bits
ldx sreg+1 ; Get into register for speed
ldx ptr1+1 ; check if lhs is 8 bit only
beq mul8x8 ; Do 8x8 multiplication if high byte zero
mul8x16a:
sta ptr4+1 ; Clear byte 2
@ -29,12 +29,12 @@ mul8x16a:
@L0: bcc @L1
clc
adc sreg
pha
txa ; hi byte of left op
adc ptr1
tax
lda ptr1+1 ; hi byte of left op
adc ptr4+1
sta ptr4+1
pla
txa
@L1: ror ptr4+1
ror a
@ -52,7 +52,7 @@ mul8x8:
lsr ptr4 ; Get first bit into carry
@L0: bcc @L1
clc
adc sreg
adc ptr1
@L1: ror
ror ptr4
dey

View File

@ -3,6 +3,7 @@
;
; CC65 runtime: Multiply the primary register by 5
;
; Don't touch the Y-register here, the optimizer relies on it!
.export mulax5
.importzp ptr1

View File

@ -4,6 +4,7 @@
;
; CC65 runtime: Multiply the primary register by 7
;
; Don't touch the Y-register here, the optimizer relies on it!
.export mulax7
.importzp ptr1

View File

@ -4,6 +4,7 @@
;
; CC65 runtime: Multiply the primary register by 9
;
; Don't touch the Y-register here, the optimizer relies on it!
.export mulax9
.importzp ptr1