Greatly improved multiplication routine.

Optimized the generic 16x16 routine and added special cases for 8x16 and 8x8. The 8x16 routine is called directly by the compiler as tosmula0 and tosumula0, respectively.

git-svn-id: svn://svn.cc65.org/cc65/trunk@4036 b7a2c559-68d2-44c3-8de9-860c34a00d81
2026-04-20 02:17:07 +00:00 · 2009-08-17 20:48:28 +00:00
parent b5a44f9542
commit acd1233bf2
3 changed files with 121 additions and 31 deletions
@@ -152,6 +152,7 @@ OBJS = 	add.o		\
       	makebool.o	\
       	mod.o  		\
       	mul.o  		\
+        mul8.o          \
 	mulax3.o	\
 	mulax5.o	\
 	mulax6.o	\
@@ -1,43 +1,69 @@
 ;
-; Ullrich von Bassewitz, 07.08.1998
+; Ullrich von Bassewitz, 2009-08-17
 ;
 ; CC65 runtime: multiplication for ints
 ;

-       	.export		tosumula0, tosumulax, tosmula0, tosmulax
-	.import		popsreg
-	.importzp	sreg, tmp1, ptr4
+       	.export		tosumulax, tosmulax
+        .import         mul8x16, mul8x16a       ; in mul8.s
+    	.import		popsreg
+    	.importzp	sreg, tmp1, ptr4
+
+
+;---------------------------------------------------------------------------
+; 16x16 multiplication: AX (right op) * left op (via popsreg) --> AX

-tosmula0:
-tosumula0:
-	ldx	#0
 tosmulax:
 tosumulax:
-mul16:	sta	ptr4
-      	stx	ptr4+1 	       	; Save right operand
-      	jsr	popsreg	       	; Get left operand
+        sta	ptr4            ; Save low byte of right operand
+        txa                     ; Test high byte of right operand
+        beq     @L3             ; Do 8x16 multiplication if high byte zero
+       	stx	ptr4+1 	       	; Save high byte of right operand
+       	jsr	popsreg	       	; Get left operand into sreg

-; Do ptr4*sreg --> AX (see mult-div.s from "The Fridge").
+; Do ptr4:ptr4+1 * sreg:sreg+1 --> AX (shift and add, low 16 bits are kept)

-	lda	#0
-   	sta	tmp1
-	ldx	sreg+1	       	; Get into register for speed
-   	ldy    	#16 	       	; Number of bits
-L0:	lsr	tmp1
-   	ror	a
-   	ror	ptr4+1
-   	ror	ptr4
-   	bcc	L1
-   	clc
-   	adc	sreg
-   	pha
+       	lda	#0              ; Clear low byte of running sum
+       	ldx	sreg+1	       	; Get high byte into register for speed
+        beq     @L4             ; -> we can do 8x16 after swap
+       	sta	tmp1            ; Clear high byte of running sum
+       	ldy    	#16 	       	; Number of bits
+
+        lsr     ptr4+1
+        ror     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1             ; Multiplier bit clear -> skip the add
+
+      	clc
+      	adc	sreg            ; Add low byte of left op to sum
+      	pha                     ; Park low byte of sum while A does the high add
       	txa	    	       	; hi byte of left op
-   	adc	tmp1
-   	sta	tmp1
-   	pla
-L1:	dey
-       	bpl    	L0
-	lda	ptr4	       	; Load the result
-	ldx	ptr4+1
-	rts			; Done
+      	adc	tmp1
+      	sta	tmp1
+      	pla                     ; pla leaves carry alone, so it feeds the ror below
+
+@L1:    ror     tmp1            ; Shift tmp1:A:ptr4+1:ptr4 right by one bit
+     	ror	a
+     	ror	ptr4+1
+     	ror	ptr4            ; Next multiplier bit drops into carry
+        dey
+        bne     @L0
+
+      	lda	ptr4	       	; Load the result
+      	ldx	ptr4+1
+      	rts	    		; Done
+
+; High byte of rhs is zero, jump to the 8x16 routine instead
+
+@L3:    jmp     mul8x16
+
+; If the high byte of lhs is zero, swap the operands and use the 8x16
+; routine. On entry, A and X are zero
+
+@L4:    ldy     sreg            ; Save left operand (8 bit)
+        ldx     ptr4            ; Copy low byte of 16 bit right operand to sreg
+        stx     sreg
+        ldx     ptr4+1          ; High byte is passed in X, not stored
+        sty     ptr4            ; 8 bit left operand becomes the multiplier
+        ldy     #8              ; Number of bits
+        jmp     mul8x16a

@@ -0,0 +1,63 @@
+;
+; Ullrich von Bassewitz, 2009-08-17
+;
+; CC65 runtime: multiplication for ints. Short versions.
+;
+
+       	.export		tosumula0, tosmula0
+        .export         mul8x16, mul8x16a
+    	.import		popsreg
+    	.importzp	sreg, ptr4
+
+
+;---------------------------------------------------------------------------
+; 8x16 routine with external entry points used by the 16x16 routine in mul.s
+
+tosmula0:
+tosumula0:
+        sta   	ptr4            ; Save right operand (8 bit)
+mul8x16:jsr   	popsreg	       	; Get left operand into sreg
+
+    	lda   	#0              ; Clear byte 1
+       	ldy    	#8    	       	; Number of bits
+    	ldx   	sreg+1	       	; Get into register for speed
+        beq     mul8x8          ; Do 8x8 multiplication if high byte zero
+mul8x16a:                       ; Entry: A = 0, Y = 8, X = hi byte of 16 bit op
+    	sta   	ptr4+1          ; Clear byte 2
+
+        lsr     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1             ; Multiplier bit clear -> skip the add
+
+      	clc
+      	adc   	sreg            ; Add low byte of 16 bit operand
+      	pha                     ; Park byte 1 while A does the high add
+       	txa   	    	       	; hi byte of 16 bit operand
+      	adc   	ptr4+1
+      	sta   	ptr4+1
+      	pla                     ; pla leaves carry alone, so it feeds the ror below
+
+@L1:    ror    	ptr4+1          ; Shift ptr4+1:A:ptr4 right by one bit
+      	ror   	a
+      	ror   	ptr4            ; Next multiplier bit drops into carry
+        dey
+        bne     @L0
+        tax                     ; High byte of result
+        lda     ptr4            ; Load the result (low byte)
+        rts
+
+;---------------------------------------------------------------------------
+; 8x8 multiplication routine. Branched to from mul8x16 with A = 0 and Y = 8
+
+mul8x8:
+        lsr     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1             ; Multiplier bit clear -> skip the add
+        clc
+        adc     sreg            ; Add 8 bit left operand
+@L1:    ror                     ; Shift A:ptr4 right, carry of the add comes in
+        ror     ptr4            ; Next multiplier bit drops into carry
+        dey
+        bne     @L0
+        tax                     ; High byte of result
+    	lda	ptr4  	       	; Load the result
+    	rts	   		; Done
+