1
0
mirror of https://github.com/cc65/cc65.git synced 2025-01-10 19:29:45 +00:00

Greatly improved multiplication routine. Optimized the generic 16x16 one and
added special cases for 8x16 and 8x8. The 8x16 routine is called directly by
the compiler as tosmula0 and tosumula0.


git-svn-id: svn://svn.cc65.org/cc65/trunk@4036 b7a2c559-68d2-44c3-8de9-860c34a00d81
This commit is contained in:
uz 2009-08-17 20:48:28 +00:00
parent b5a44f9542
commit acd1233bf2
3 changed files with 121 additions and 31 deletions

View File

@ -152,6 +152,7 @@ OBJS = add.o \
makebool.o \ makebool.o \
mod.o \ mod.o \
mul.o \ mul.o \
mul8.o \
mulax3.o \ mulax3.o \
mulax5.o \ mulax5.o \
mulax6.o \ mulax6.o \

View File

@ -1,43 +1,69 @@
; ;
; Ullrich von Bassewitz, 07.08.1998 ; Ullrich von Bassewitz, 2009-08-17
; ;
; CC65 runtime: multiplication for ints ; CC65 runtime: multiplication for ints
; ;
.export tosumula0, tosumulax, tosmula0, tosmulax .export tosumulax, tosmulax
.import popsreg .import mul8x16, mul8x16a ; in mul8.s
.importzp sreg, tmp1, ptr4 .import popsreg
.importzp sreg, tmp1, ptr4
;---------------------------------------------------------------------------
; 16x16 multiplication routine
tosmula0:
tosumula0:
ldx #0
tosmulax: tosmulax:
tosumulax: tosumulax:
mul16: sta ptr4 sta ptr4
stx ptr4+1 ; Save right operand txa ; High byte zero
jsr popsreg ; Get left operand beq @L3 ; Do 8x16 multiplication if high byte zero
stx ptr4+1 ; Save right operand
jsr popsreg ; Get left operand
; Do ptr4*sreg --> AX (see mult-div.s from "The Fridge"). ; Do ptr4:ptr4+1 * sreg:sreg+1 --> AX
lda #0 lda #0
sta tmp1 ldx sreg+1 ; Get high byte into register for speed
ldx sreg+1 ; Get into register for speed beq @L4 ; -> we can do 8x16 after swap
ldy #16 ; Number of bits sta tmp1
L0: lsr tmp1 ldy #16 ; Number of bits
ror a
ror ptr4+1 lsr ptr4+1
ror ptr4 ror ptr4 ; Get first bit into carry
bcc L1 @L0: bcc @L1
clc
adc sreg clc
pha adc sreg
pha
txa ; hi byte of left op txa ; hi byte of left op
adc tmp1 adc tmp1
sta tmp1 sta tmp1
pla pla
L1: dey
bpl L0 @L1: ror tmp1
lda ptr4 ; Load the result ror a
ldx ptr4+1 ror ptr4+1
rts ; Done ror ptr4
dey
bne @L0
lda ptr4 ; Load the result
ldx ptr4+1
rts ; Done
; High byte of rhs is zero, jump to the 8x16 routine instead. The low byte
; of rhs has already been stored in ptr4 at this point.
@L3: jmp mul8x16
; The high byte of the left operand (sreg+1) is zero: swap the operands and
; use the 8x16 routine. On entry, A and X are zero.
@L4: ldy sreg ; Save left operand (8 bit)
ldx ptr4 ; Low byte of the 16 bit right operand goes to sreg
stx sreg
ldx ptr4+1 ; High byte stays in X, mul8x16a takes it from there
sty ptr4 ; 8 bit left operand becomes the multiplier in ptr4
ldy #8 ; Number of bits, expected in Y by mul8x16a
jmp mul8x16a

63
libsrc/runtime/mul8.s Normal file
View File

@ -0,0 +1,63 @@
;
; Ullrich von Bassewitz, 2009-08-17
;
; CC65 runtime: multiplication for ints. Short versions.
;
.export tosumula0, tosmula0
.export mul8x16, mul8x16a
.import popsreg
.importzp sreg, ptr4
;---------------------------------------------------------------------------
; 8 bit x 16 bit multiplication, low 16 bits of the product are returned.
;
; Entry points:
;   tosmula0, tosumula0   A = 8 bit right operand, left operand is fetched
;                         with popsreg. X is not read on entry.
;   mul8x16               ptr4 = 8 bit operand, the other operand is fetched
;                         with popsreg. Used by the 16x16 routine in mul.s.
;   mul8x16a              ptr4 = 8 bit operand, sreg = low byte and X = high
;                         byte of the 16 bit operand, A = 0, Y = 8. Used by
;                         the 16x16 routine in mul.s.
;
; Result: A = low byte, X = high byte. ptr4, ptr4+1 and Y are overwritten.
;
; The product is collected in the 24 bit value ptr4+1:A:ptr4, which is
; shifted right once per multiplier bit. ptr4 starts out as the multiplier;
; each shift moves its next bit into the carry and makes room for one more
; product bit at the top.

tosmula0:
tosumula0:
        sta     ptr4            ; 8 bit operand is the multiplier
mul8x16:
        jsr     popsreg         ; Get left operand (read from sreg below)

        ldy     #8              ; One pass per multiplier bit
        lda     #0              ; Product byte 1 starts out as zero
        ldx     sreg+1          ; Keep high byte of multiplicand in X
        beq     mul8x8          ; High byte zero: 8x8 is sufficient
mul8x16a:
        sta     ptr4+1          ; A is zero here: clear product byte 2

        lsr     ptr4            ; Lowest multiplier bit -> carry
@NextBit:
        bcc     @Rotate         ; Bit clear: nothing to add

        clc
        adc     sreg            ; Byte 1 += low byte of multiplicand
        pha                     ; Park byte 1, A is needed for the high add
        txa                     ; High byte of multiplicand
        adc     ptr4+1          ; Byte 2 += high byte + carry
        sta     ptr4+1
        pla                     ; Byte 1 back in A, carry is not touched

@Rotate:
        ror     ptr4+1          ; Carry of the add (or zero) enters at top
        ror     a
        ror     ptr4            ; Next multiplier bit drops into carry
        dey
        bne     @NextBit

        tax                     ; Product byte 1 is the high result byte
        lda     ptr4            ; Product byte 0 is the low result byte
        rts
;---------------------------------------------------------------------------
; 8 bit x 8 bit multiplication, 16 bit result.
;
; Entered from mul8x16 with ptr4 = multiplier, sreg = multiplicand, A = 0
; and Y = 8. The product is collected in A:ptr4, which is shifted right
; once per multiplier bit.
;
; Result: A = low byte, X = high byte. ptr4 and Y are overwritten.

mul8x8:
        lsr     ptr4            ; Lowest multiplier bit -> carry
@NextBit:
        bcc     @Rotate         ; Bit clear: nothing to add
        clc
        adc     sreg            ; High byte += multiplicand
@Rotate:
        ror     a               ; Carry of the add (or zero) enters at top
        ror     ptr4            ; Next multiplier bit drops into carry
        dey
        bne     @NextBit

        tax                     ; High byte of the product
        lda     ptr4            ; Low byte of the product
        rts                     ; Done