1
0
mirror of https://github.com/cc65/cc65.git synced 2024-09-29 17:56:21 +00:00

Greatly improved multiplication routine. Optimized the generic 16x16 one and

added special cases for 8x16 and 8x8. The former is directly called by the
compiler as tosmula0 and tosumula0 resp.


git-svn-id: svn://svn.cc65.org/cc65/trunk@4036 b7a2c559-68d2-44c3-8de9-860c34a00d81
This commit is contained in:
uz 2009-08-17 20:48:28 +00:00
parent b5a44f9542
commit acd1233bf2
3 changed files with 121 additions and 31 deletions

View File

@ -152,6 +152,7 @@ OBJS = add.o \
makebool.o \
mod.o \
mul.o \
mul8.o \
mulax3.o \
mulax5.o \
mulax6.o \

View File

@ -1,43 +1,69 @@
;
; Ullrich von Bassewitz, 07.08.1998
; Ullrich von Bassewitz, 2009-08-17
;
; CC65 runtime: multiplication for ints
;
.export tosumula0, tosumulax, tosmula0, tosmulax
.import popsreg
.importzp sreg, tmp1, ptr4
.export tosumulax, tosmulax
.import mul8x16, mul8x16a ; in mul8.s
.import popsreg
.importzp sreg, tmp1, ptr4
;---------------------------------------------------------------------------
; 16x16 multiplication routine
tosmula0:
tosumula0:
ldx #0
tosmulax:
tosumulax:
mul16: sta ptr4
stx ptr4+1 ; Save right operand
jsr popsreg ; Get left operand
sta ptr4
txa ; High byte zero
beq @L3 ; Do 8x16 multiplication if high byte zero
stx ptr4+1 ; Save right operand
jsr popsreg ; Get left operand
; Do ptr4*sreg --> AX (see mult-div.s from "The Fridge").
; Do ptr4:ptr4+1 * sreg:sreg+1 --> AX
lda #0
sta tmp1
ldx sreg+1 ; Get into register for speed
ldy #16 ; Number of bits
L0: lsr tmp1
ror a
ror ptr4+1
ror ptr4
bcc L1
clc
adc sreg
pha
lda #0
ldx sreg+1 ; Get high byte into register for speed
beq @L4 ; -> we can do 8x16 after swap
sta tmp1
ldy #16 ; Number of bits
lsr ptr4+1
ror ptr4 ; Get first bit into carry
@L0: bcc @L1
clc
adc sreg
pha
txa ; hi byte of left op
adc tmp1
sta tmp1
pla
L1: dey
bpl L0
lda ptr4 ; Load the result
ldx ptr4+1
rts ; Done
adc tmp1
sta tmp1
pla
@L1: ror tmp1
ror a
ror ptr4+1
ror ptr4
dey
bne @L0
lda ptr4 ; Load the result
ldx ptr4+1
rts ; Done
; High byte of rhs is zero, jump to the 8x16 routine instead
@L3: jmp mul8x16
; The high byte of the operand in sreg (the one fetched by popsreg, i.e. the
; left operand) is zero. Swap the operands so that the 8 bit value ends up in
; ptr4 and the 16 bit value in sreg (low byte) and X (high byte), then use
; the 8x16 routine. On entry, A and X are zero
@L4: ldy sreg ; Save low byte of left operand (its high byte is zero)
ldx ptr4 ; Low byte of the 16 bit right operand ...
stx sreg ; ... goes where mul8x16a reads its 16 bit operand
ldx ptr4+1 ; High byte of right operand is passed on in X, not in memory
sty ptr4 ; The 8 bit left operand goes into ptr4
ldy #8 ; Number of bits
jmp mul8x16a

63
libsrc/runtime/mul8.s Normal file
View File

@ -0,0 +1,63 @@
;
; Ullrich von Bassewitz, 2009-08-17
;
; CC65 runtime: multiplication for ints. Short versions.
;
.export tosumula0, tosmula0
.export mul8x16, mul8x16a
.import popsreg
.importzp sreg, ptr4
;---------------------------------------------------------------------------
; 8x16 multiplication. Besides the two compiler entry points, mul8x16 and
; mul8x16a are exported for use by the 16x16 routine in mul.s.
;
; tosmula0/tosumula0: A = 8 bit operand, other operand fetched via popsreg
; mul8x16:            ptr4 = 8 bit operand, other operand fetched via popsreg
; mul8x16a:           ptr4 = 8 bit operand, sreg = low byte and X = high byte
;                     of the 16 bit operand, A = 0, Y = 8
;
; The product is built in ptr4+1:A:ptr4 and shifted right once per bit.
; The low 16 bits are returned in A (low byte) and X (high byte).

tosmula0:
tosumula0:
        sta     ptr4            ; 8 bit operand is the multiplier
mul8x16:
        jsr     popsreg         ; Fetch the other operand into sreg

        ldy     #8              ; One loop pass per multiplier bit
        lda     #0              ; Middle byte of the product starts at zero
        ldx     sreg+1          ; High byte lives in X during the loop
        beq     mul8x8          ; High byte zero: the 8x8 routine is enough

mul8x16a:
        sta     ptr4+1          ; A is zero here: clear top byte of product
        lsr     ptr4            ; Lowest multiplier bit -> carry

@next:  bcc     @shift          ; Bit clear: skip the addition

        clc
        adc     sreg            ; Add low byte of the 16 bit operand
        pha                     ; Park middle byte while A does the top byte
        txa                     ; High byte of the 16 bit operand
        adc     ptr4+1          ; Carry from the low byte addition included
        sta     ptr4+1
        pla                     ; Middle byte back, carry is left untouched

@shift: ror     ptr4+1          ; Shift ptr4+1:A:ptr4 right by one bit; the
        ror     a               ; carry of the addition enters at the top,
        ror     ptr4            ; the next multiplier bit leaves at the bottom
        dey
        bne     @next

        tax                     ; High byte of the 16 bit result
        lda     ptr4            ; Low byte of the 16 bit result
        rts
;---------------------------------------------------------------------------
; 8x8 multiplication: ptr4 * sreg. Reached from mul8x16 with A = 0 and
; Y = 8. The product is built in A:ptr4 and shifted right once per bit.
; It is returned in A (low byte) and X (high byte).

mul8x8:
        lsr     ptr4            ; Lowest multiplier bit -> carry

@next:  bcc     @shift          ; Bit clear: skip the addition
        clc
        adc     sreg            ; Add the multiplicand to the high byte

@shift: ror     a               ; Shift A:ptr4 right by one bit; the next
        ror     ptr4            ; multiplier bit drops into the carry
        dey
        bne     @next

        tax                     ; High byte of the result
        lda     ptr4            ; Low byte of the result
        rts                     ; Done