diff --git a/libsrc/runtime/Makefile b/libsrc/runtime/Makefile
index 14158398e..10a5f0e8d 100644
--- a/libsrc/runtime/Makefile
+++ b/libsrc/runtime/Makefile
@@ -152,6 +152,7 @@ OBJS = 	add.o		\
        	makebool.o	\
        	mod.o  		\
        	mul.o  		\
+        mul8.o          \
 	mulax3.o	\
 	mulax5.o	\
 	mulax6.o	\
diff --git a/libsrc/runtime/mul.s b/libsrc/runtime/mul.s
index 67caf52cf..eaf1fb97b 100644
--- a/libsrc/runtime/mul.s
+++ b/libsrc/runtime/mul.s
@@ -1,43 +1,69 @@
 ;
-; Ullrich von Bassewitz, 07.08.1998
+; Ullrich von Bassewitz, 2009-08-17
 ;
 ; CC65 runtime: multiplication for ints
 ;
 
-       	.export		tosumula0, tosumulax, tosmula0, tosmulax
-	.import		popsreg
-	.importzp	sreg, tmp1, ptr4
+       	.export		tosumulax, tosmulax
+        .import         mul8x16, mul8x16a       ; in mul8.s
+    	.import		popsreg
+    	.importzp	sreg, tmp1, ptr4
+
+
+;---------------------------------------------------------------------------
+; 16x16 multiplication routine
 
-tosmula0:
-tosumula0:
-	ldx	#0
 tosmulax:
 tosumulax:
-mul16:	sta	ptr4
-      	stx	ptr4+1 	       	; Save right operand
-      	jsr	popsreg	       	; Get left operand
+        sta	ptr4            ; Save low byte of right operand
+        txa                     ; Test high byte of right operand
+        beq     @L3             ; Do 8x16 multiplication if high byte zero
+       	stx	ptr4+1 	       	; Save high byte of right operand
+       	jsr	popsreg	       	; Get left operand
 
-; Do ptr4*sreg --> AX (see mult-div.s from "The Fridge").
+; Do ptr4:ptr4+1 * sreg:sreg+1 --> AX (low 16 bits of the product)
 
-	lda	#0
-   	sta	tmp1
-	ldx	sreg+1	       	; Get into register for speed
-   	ldy    	#16 	       	; Number of bits
-L0:	lsr	tmp1
-   	ror	a
-   	ror	ptr4+1
-   	ror	ptr4
-   	bcc	L1
-   	clc
-   	adc	sreg
-   	pha
+       	lda	#0              ; A = 0, accumulates byte 2 of the product
+       	ldx	sreg+1	       	; Get high byte into register for speed
+        beq     @L4             ; -> we can do 8x16 after swap
+       	sta	tmp1            ; Clear byte 3 of the product
+       	ldy    	#16 	       	; Number of bits
+
+        lsr     ptr4+1          ; Shift right operand right by one
+        ror     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1             ; Bit clear -> skip the addition
+
+      	clc
+      	adc	sreg            ; Add low byte of left op to byte 2
+      	pha                     ; Save byte 2, A is needed for the high byte
        	txa	    	       	; hi byte of left op
-   	adc	tmp1
-   	sta	tmp1
-   	pla
-L1:	dey
-       	bpl    	L0
-	lda	ptr4	       	; Load the result
-	ldx	ptr4+1
-	rts			; Done
+      	adc	tmp1            ; Add to byte 3 including carry from byte 2
+      	sta	tmp1
+      	pla                     ; Restore byte 2
+
+@L1:    ror     tmp1            ; Rotate tmp1:A:ptr4+1:ptr4 right by one bit
+     	ror	a
+     	ror	ptr4+1
+     	ror	ptr4            ; Next bit of right operand --> carry
+        dey
+        bne     @L0             ; Repeat for all 16 bits
+
+      	lda	ptr4	       	; Load the result
+      	ldx	ptr4+1
+      	rts	    		; Done
+
+; High byte of rhs is zero, jump to the 8x16 routine instead
+
+@L3:    jmp     mul8x16
+
+; If the high byte of lhs is zero, swap the operands and use the 8x16
+; routine. On entry, A and X are zero
+
+@L4:    ldy     sreg            ; Save low byte of left operand (8 bit)
+        ldx     ptr4            ; Copy low byte of 16 bit right op to sreg
+        stx     sreg
+        ldx     ptr4+1          ; High byte stays in X, mul8x16a reads it there
+        sty     ptr4            ; Former left op (8 bit) becomes the multiplier
+        ldy     #8              ; Number of bits
+        jmp     mul8x16a
 
diff --git a/libsrc/runtime/mul8.s b/libsrc/runtime/mul8.s
new file mode 100644
index 000000000..3287e2155
--- /dev/null
+++ b/libsrc/runtime/mul8.s
@@ -0,0 +1,63 @@
+;
+; Ullrich von Bassewitz, 2009-08-17
+;
+; CC65 runtime: multiplication for ints. Short versions.
+;
+
+       	.export		tosumula0, tosmula0
+        .export         mul8x16, mul8x16a
+    	.import		popsreg
+    	.importzp	sreg, ptr4
+
+
+;---------------------------------------------------------------------------
+; 8x16 routine with external entry points used by the 16x16 routine in mul.s
+
+tosmula0:
+tosumula0:
+        sta   	ptr4            ; Save right operand (8 bit, passed in A)
+mul8x16:jsr   	popsreg	       	; Get left operand
+
+    	lda   	#0              ; Clear byte 1
+       	ldy    	#8    	       	; Number of bits
+    	ldx   	sreg+1	       	; Get high byte into register for speed
+        beq     mul8x8          ; Do 8x8 multiplication if high byte zero
+mul8x16a:                       ; Entry: A = 0, Y = 8, X = high byte of 16 bit op
+    	sta   	ptr4+1          ; Clear byte 2
+
+        lsr     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1             ; Bit clear -> skip the addition
+
+      	clc
+      	adc   	sreg            ; Add low byte of 16 bit op to byte 1
+      	pha                     ; Save byte 1, A is needed for the high byte
+       	txa   	    	       	; hi byte of left op
+      	adc   	ptr4+1          ; Add to byte 2 including carry from byte 1
+      	sta   	ptr4+1
+      	pla                     ; Restore byte 1
+
+@L1:    ror    	ptr4+1          ; Rotate ptr4+1:A:ptr4 right by one bit
+      	ror   	a
+      	ror   	ptr4            ; Next bit of 8 bit operand --> carry
+        dey
+        bne     @L0             ; Repeat for all 8 bits
+        tax                     ; Byte 1 is the high byte of the result
+        lda     ptr4            ; Load the result (low byte)
+        rts
+
+;---------------------------------------------------------------------------
+; 8x8 multiplication routine
+
+mul8x8:                         ; Entry from mul8x16: A = 0, Y = 8
+        lsr     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1             ; Bit clear -> skip the addition
+        clc
+        adc     sreg            ; Add low byte of left operand to high byte
+@L1:    ror                     ; Rotate A:ptr4 right, carry from add enters A
+        ror     ptr4            ; Next bit of right operand --> carry
+        dey
+        bne     @L0             ; Repeat for all 8 bits
+        tax                     ; High byte of result into X
+    	lda	ptr4  	       	; Load the result
+    	rts	   		; Done
+