; prog8/compiler/res/prog8lib/math.asm

; Internal Math library routines - always included by the compiler
; Generic machine independent 6502 code.
;
;  some more interesting routines can be found here:
;	http://6502org.wikidot.com/software-math
;	http://codebase64.org/doku.php?id=base:6502_6510_maths
;


math_store_reg	.byte  0		; temporary storage: routines in this file park the caller's X register here while they use X as a loop counter


multiply_bytes	.proc
	; -- multiply 2 bytes A and Y, result as byte in A  (signed or unsigned)
	;    Shift-and-add: for every set bit of num2 (taken from the low end) the
	;    progressively doubled num1 is added to A. Only the low 8 bits are kept.
	;    Uses P8ZP_SCRATCH_B1 and P8ZP_SCRATCH_REG; the X and Y registers are not modified.
		sta  P8ZP_SCRATCH_B1         ; num1
		sty  P8ZP_SCRATCH_REG        ; num2
		lda  #0				; running product
		beq  _enterloop			; always taken, A was just loaded with 0
_doAdd		clc
		adc  P8ZP_SCRATCH_B1
_loop		asl  P8ZP_SCRATCH_B1		; num1 *= 2
_enterloop	lsr  P8ZP_SCRATCH_REG		; next bit of num2 -> carry, Z set when num2 is used up
		bcs  _doAdd
		bne  _loop			; flags still come from the lsr above
		rts
		.pend


multiply_bytes_into_word	.proc
	; -- unsigned 8 x 8 bit multiply of A and Y, 16 bit product returned in A (lsb) / Y (msb)
	;    X is preserved through math_store_reg.
	;    P8ZP_SCRATCH_B1 starts as the multiplier and gradually becomes the
	;    low byte of the product; A accumulates the high byte.
		stx  math_store_reg
		sty  P8ZP_SCRATCH_REG		; multiplicand
		sta  P8ZP_SCRATCH_B1		; multiplier
		lda  #0
		ldx  #8				; one pass per multiplier bit
		lsr  P8ZP_SCRATCH_B1		; first multiplier bit -> carry
_nextbit	bcc  _rotate
		clc
		adc  P8ZP_SCRATCH_REG
_rotate		ror  a				; shift product right, next multiplier bit drops into carry
		ror  P8ZP_SCRATCH_B1
		dex
		bne  _nextbit
		tay				; high byte
		lda  P8ZP_SCRATCH_B1		; low byte
		ldx  math_store_reg
		rts
		.pend


multiply_words	.proc
	; -- 16 x 16 bit multiplication with a 32 bit product  (signed and unsigned)
	;      input:  A/Y = first 16-bit number, P8ZP_SCRATCH_W1 in ZP = second 16-bit number
	;      output: multiply_words.result  4 bytes, least significant byte first
	;      clobbers: A.  X is parked in P8ZP_SCRATCH_REG and restored on exit.
	;      P8ZP_SCRATCH_W1 is shifted empty and P8ZP_SCRATCH_W2 receives A/Y.

		stx  P8ZP_SCRATCH_REG
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2

mult16		lda  #0
		sta  result+3		; only the upper half is cleared; the lower half
		sta  result+2		; is completely replaced by bits rotated into it
		ldx  #16
_nextbit	lsr  P8ZP_SCRATCH_W1+1	; lowest multiplier bit -> carry
		ror  P8ZP_SCRATCH_W1
		bcc  _rotate		; bit clear: A still equals result+3 and carry is 0
		lda  result+2
		clc
		adc  P8ZP_SCRATCH_W2
		sta  result+2
		lda  result+3
		adc  P8ZP_SCRATCH_W2+1
_rotate		ror  a			; rotate the 32 bit partial product right, carry enters at the top
		sta  result+3
		ror  result+2
		ror  result+1
		ror  result
		dex
		bne  _nextbit
		ldx  P8ZP_SCRATCH_REG
		rts

result		.byte  0,0,0,0
		.pend


divmod_b_asm	.proc
	; signed byte division A / Y: the absolute values are divided by
	; divmod_ub_asm and the quotient is negated when the operand signs differ.
	; The quotient is returned in Y; the (unsigned) remainder is stored in _remainder.
		sta  P8ZP_SCRATCH_B1
		tya
		eor  P8ZP_SCRATCH_B1
		php			; N flag = sign of the quotient
		lda  P8ZP_SCRATCH_B1
		bpl  _dividend_ok
		eor  #$ff
		clc
		adc  #1			; two's complement
_dividend_ok	pha
		tya
		bpl  _divisor_ok
		eor  #$ff
		clc
		adc  #1			; two's complement
		tay
_divisor_ok	pla
		jsr  divmod_ub_asm
		sta  _remainder
		plp
		bpl  _done
		tya
		eor  #$ff
		clc
		adc  #1			; quotient = -quotient
		tay
_done		rts
_remainder	.byte  0
		.pend


divmod_ub_asm	.proc
	; -- unsigned byte division A / Y:  quotient in Y, remainder in A
	;    a zero divisor yields quotient = 255 and remainder = original number
	;    X is preserved through math_store_reg.
		stx  math_store_reg
		sta  P8ZP_SCRATCH_B1		; dividend, becomes the quotient bit by bit
		sty  P8ZP_SCRATCH_REG		; divisor

		lda  #0				; remainder
		ldx  #8
		asl  P8ZP_SCRATCH_B1
_nextbit	rol  a
		cmp  P8ZP_SCRATCH_REG
		bcc  _toosmall
		sbc  P8ZP_SCRATCH_REG		; carry is known to be set here
_toosmall	rol  P8ZP_SCRATCH_B1		; carry = new quotient bit
		dex
		bne  _nextbit
		ldy  P8ZP_SCRATCH_B1
		ldx  math_store_reg
		rts
		.pend

divmod_w_asm	.proc
	; signed word division: P8ZP_SCRATCH_W1 divided by A/Y, quotient returned in A/Y.
	; Both operands are made positive, divided by divmod_uw_asm, and the
	; quotient is negated afterwards when the operand signs differed.
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2
		lda  P8ZP_SCRATCH_W1+1
		eor  P8ZP_SCRATCH_W2+1
		php			; N flag = sign of the quotient
		lda  P8ZP_SCRATCH_W1+1
		bpl  _dividend_ok
		lda  #0			; W1 = 0 - W1
		sec
		sbc  P8ZP_SCRATCH_W1
		sta  P8ZP_SCRATCH_W1
		lda  #0
		sbc  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W1+1
_dividend_ok	lda  P8ZP_SCRATCH_W2+1
		bpl  _divisor_ok
		lda  #0			; W2 = 0 - W2
		sec
		sbc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W2
		lda  #0
		sbc  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2+1
_divisor_ok	tay			; A holds the divisor msb on both paths
		lda  P8ZP_SCRATCH_W2
		jsr  divmod_uw_asm
		plp			; get the quotient sign back
		bpl  _done
		sta  P8ZP_SCRATCH_W2	; this overwrites the remainder divmod_uw_asm left in W2
		sty  P8ZP_SCRATCH_W2+1
		lda  #0			; A/Y = 0 - A/Y
		sec
		sbc  P8ZP_SCRATCH_W2
		pha
		lda  #0
		sbc  P8ZP_SCRATCH_W2+1
		tay
		pla
_done		rts
		.pend

divmod_uw_asm	.proc
	; -- unsigned 16 bit division
	;    input:  P8ZP_SCRATCH_W1 in ZP: 16 bit dividend, A/Y: 16 bit divisor
	;    output: P8ZP_SCRATCH_W2 in ZP: 16 bit remainder, A/Y: 16 bit quotient
	;    division by zero will result in quotient = 65535 and remainder = dividend
	;    X is parked in P8ZP_SCRATCH_REG and restored on exit.


dividend = P8ZP_SCRATCH_W1
remainder = P8ZP_SCRATCH_W2
result = dividend ; the quotient is built in the dividend's storage to save memory

		stx  P8ZP_SCRATCH_REG
		sty  _divisor+1
		sta  _divisor
		lda  #0				; remainder starts at 0
		sta  remainder+1
		sta  remainder
		ldx  #16			; one pass per dividend bit

_nextbit	asl  dividend			; dividend * 2, its top bit -> carry
		rol  dividend+1
		rol  remainder			; remainder * 2 + that bit
		rol  remainder+1
		lda  remainder
		sec
		sbc  _divisor			; trial subtraction of the divisor
		tay				; keep the low byte, it is needed if the divisor fits
		lda  remainder+1
		sbc  _divisor+1
		bcc  _nofit			; borrow: divisor does not fit yet

		sta  remainder+1		; divisor fits: the difference is the new remainder
		sty  remainder
		inc  result			; and this quotient bit becomes 1

_nofit		dex
		bne  _nextbit

		lda  result
		ldy  result+1
		ldx  P8ZP_SCRATCH_REG
		rts
_divisor	.word 0
		.pend


randseed	.proc
	; -- reset the seed of the pseudo random number generator (randbyte / randword)
	;    arguments: uword seed in A/Y   clobbers A
	;    The seed is loaded into shift register sr1 of randword; sr2 is put back
	;    to its assembled start value, so the same seed always gives the same sequence.
	;    A seed of 0 is replaced by sr1's default $a55a: the feedback bit of sr1
	;    is built from its own bits only, so an all-zero sr1 would stay zero forever.
		sta  randword.sr1
		sty  randword.sr1+1
		ora  randword.sr1+1		; A = lsb | msb, zero only when the seed is 0
		bne  _seed_ok
		lda  #<$a55a
		sta  randword.sr1
		lda  #>$a55a
		sta  randword.sr1+1
_seed_ok	lda  #<$7653
		sta  randword.sr2
		lda  #>$7653
		sta  randword.sr2+1
		rts
		.pend


randbyte        .proc
	; -- 8 bit pseudo random number generator into A (by just reusing randword)
	;    Tail-jumps into randword, so Y is overwritten as well (it receives
	;    the high byte of the word result).
		jmp  randword
		.pend

randword	.proc
	; -- 16 bit pseudo random number generator into AY
	;    Two 16-bit shift registers (sr1, sr2) are each advanced by one step and
	;    their contents are eor-ed together: A = lsb, Y = msb. X is not touched.

		; rand64k       ;Factors of 65535: 3 5 17 257
		lda sr1+1
		asl a
		asl a
		eor sr1+1
		asl a
		eor sr1+1
		asl a
		asl a
		eor sr1+1
		asl a           ;feedback bit is now in carry
		rol sr1         ;shift this left, "random" bit comes from low
		rol sr1+1
		; rand32k       ;Factors of 32767: 7 31 151 are independent and can be combined
		lda sr2+1
		asl a
		eor sr2+1
		asl a
		asl a           ;feedback bit is now in carry
		ror sr2         ;shift this right, random bit comes from high - nicer when eor with sr1
		rol sr2+1
		lda sr1+1         ;can be left out
		eor sr2+1         ;if you dont use
		tay               ;y as suggested
		lda sr1           ;mix up lowbytes of SR1
		eor sr2           ;and SR2 to combine both
		rts

sr1     	.word $a55a	; generator state; the values here are the start values
sr2     	.word $7653

		.pend


; ----------- optimized multiplications (stack) : ---------
stack_mul_byte_3	.proc
		; X + X*2
		; -- multiply the byte at P8ESTACK_LO+1,x by 3, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_3	.proc
		; W*2 + W
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 3, in place
		;    (result modulo 65536). clobbers A and P8ZP_SCRATCH_REG.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_REG		; working copy of the msb
		lda  P8ESTACK_LO+1,x
		asl  a
		rol  P8ZP_SCRATCH_REG		; A / SCRATCH_REG = W*2
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		lda  P8ZP_SCRATCH_REG
		adc  P8ESTACK_HI+1,x		; msb add including the carry of the lsb add
		sta  P8ESTACK_HI+1,x
		rts
		.pend


stack_mul_byte_5	.proc
		; X*4 + X
		; -- multiply the byte at P8ESTACK_LO+1,x by 5, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
		asl  a
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_5	.proc
		; W*4 + W
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 5, in place
		;    (result modulo 65536). clobbers A and P8ZP_SCRATCH_REG.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_REG		; working copy of the msb
		lda  P8ESTACK_LO+1,x
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG		; A / SCRATCH_REG = W*4
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		lda  P8ZP_SCRATCH_REG
		adc  P8ESTACK_HI+1,x
		sta  P8ESTACK_HI+1,x
		rts
		.pend


stack_mul_byte_6	.proc
		; (X*2 + X)*2
		; -- multiply the byte at P8ESTACK_LO+1,x by 6, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
                clc
		adc  P8ESTACK_LO+1,x
		asl  a
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_6	.proc
		; (W*2 + W)*2
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 6, in place
		;    (result modulo 65536). clobbers A and P8ZP_SCRATCH_REG.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_REG
		lda  P8ESTACK_LO+1,x
		asl  a
		rol  P8ZP_SCRATCH_REG
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x		; lsb of W*3
		lda  P8ZP_SCRATCH_REG
		adc  P8ESTACK_HI+1,x		; A = msb of W*3
		asl  P8ESTACK_LO+1,x		; final doubling, lsb in memory and msb in A
                rol  a
		sta  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_byte_7	.proc
		; X*8 - X
		; -- multiply the byte at P8ESTACK_LO+1,x by 7, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
		asl  a
		asl  a
		sec
		sbc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_7	.proc
		; W*8 - W
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 7, in place
		;    (result modulo 65536). clobbers A and P8ZP_SCRATCH_REG.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_REG
		lda  P8ESTACK_LO+1,x
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG		; A / SCRATCH_REG = W*8
		sec
		sbc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		lda  P8ZP_SCRATCH_REG
		sbc  P8ESTACK_HI+1,x		; msb subtract including the borrow of the lsb
		sta  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_byte_9	.proc
		; X*8 + X
		; -- multiply the byte at P8ESTACK_LO+1,x by 9, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
		asl  a
		asl  a
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_9	.proc
		; W*8 + W
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 9, in place
		;    (result modulo 65536). clobbers A and P8ZP_SCRATCH_REG.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_REG
		lda  P8ESTACK_LO+1,x
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG		; A / SCRATCH_REG = W*8
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		lda  P8ZP_SCRATCH_REG
		adc  P8ESTACK_HI+1,x
		sta  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_byte_10	.proc
		; (X*4 + X)*2
		; -- multiply the byte at P8ESTACK_LO+1,x by 10, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
		asl  a
		clc
		adc  P8ESTACK_LO+1,x
		asl  a
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_10	.proc
		; (W*4 + W)*2
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 10, in place
		;    (result modulo 65536). clobbers A and P8ZP_SCRATCH_REG.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_REG
		lda  P8ESTACK_LO+1,x
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG		; A / SCRATCH_REG = W*4
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x		; lsb of W*5
		lda  P8ZP_SCRATCH_REG
		adc  P8ESTACK_HI+1,x		; A = msb of W*5
		asl  P8ESTACK_LO+1,x		; final doubling
                rol  a
		sta  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_byte_11	.proc
		; (X*2 + X)*4 - X
		; -- multiply the byte at P8ESTACK_LO+1,x by 11, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
		clc
		adc  P8ESTACK_LO+1,x
		asl  a
		asl  a
		sec
		sbc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		rts
		.pend

; mul_word_11 is skipped (too much code)

stack_mul_byte_12	.proc
		; (X*2 + X)*4
		; -- multiply the byte at P8ESTACK_LO+1,x by 12, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
		clc
		adc  P8ESTACK_LO+1,x
		asl  a
		asl  a
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_12	.proc
		; (W*2 + W)*4
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 12, in place
		;    (result modulo 65536). clobbers A and P8ZP_SCRATCH_REG.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_REG
		lda  P8ESTACK_LO+1,x
		asl  a
		rol  P8ZP_SCRATCH_REG
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x		; lsb of W*3
		lda  P8ZP_SCRATCH_REG
		adc  P8ESTACK_HI+1,x		; A = msb of W*3
		asl  P8ESTACK_LO+1,x		; two doublings, lsb in memory and msb in A
                rol  a
		asl  P8ESTACK_LO+1,x
                rol  a
		sta  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_byte_13	.proc
		; (X*2 + X)*4 + X
		; -- multiply the byte at P8ESTACK_LO+1,x by 13, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
                clc
		adc  P8ESTACK_LO+1,x
		asl  a
		asl  a
                clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		rts
		.pend

; mul_word_13 is skipped (too much code)

stack_mul_byte_14	.proc
		; (X*8 - X)*2
		; -- multiply the byte at P8ESTACK_LO+1,x by 14, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
		asl  a
		asl  a
                sec
		sbc  P8ESTACK_LO+1,x
                asl  a
		sta  P8ESTACK_LO+1,x
		rts
		.pend

; mul_word_14 is skipped (too much code)

stack_mul_byte_15	.proc
		; X*16 - X
		; -- multiply the byte at P8ESTACK_LO+1,x by 15, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
		asl  a
		asl  a
		asl  a
		sec
		sbc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_15	.proc
		; W*16 - W
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 15, in place
		;    (result modulo 65536). clobbers A and P8ZP_SCRATCH_REG.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_REG
		lda  P8ESTACK_LO+1,x
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG		; A / SCRATCH_REG = W*16
		sec
		sbc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		lda  P8ZP_SCRATCH_REG
		sbc  P8ESTACK_HI+1,x
		sta  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_byte_20	.proc
		; (X*4 + X)*4
		; -- multiply the byte at P8ESTACK_LO+1,x by 20, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
		asl  a
		clc
		adc  P8ESTACK_LO+1,x
		asl  a
		asl  a
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_20	.proc
		; (W*4 + W)*4
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 20, in place
		;    (result modulo 65536). clobbers A and P8ZP_SCRATCH_REG.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_REG
		lda  P8ESTACK_LO+1,x
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG		; A / SCRATCH_REG = W*4
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x		; lsb of W*5
		lda  P8ZP_SCRATCH_REG
		adc  P8ESTACK_HI+1,x		; A = msb of W*5
		asl  P8ESTACK_LO+1,x		; two doublings, lsb in memory and msb in A
                rol  a
		asl  P8ESTACK_LO+1,x
                rol  a
		sta  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_byte_25	.proc
		; (X*2 + X)*8 + X
		; -- multiply the byte at P8ESTACK_LO+1,x by 25, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		lda  P8ESTACK_LO+1,x
		asl  a
		clc
		adc  P8ESTACK_LO+1,x
		asl  a
		asl  a
		asl  a
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_25	.proc
		; W = (W*2 + W) *8 + W
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 25, in place
		;    (result modulo 65536). clobbers A and P8ZP_SCRATCH_W1.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_W1+1
		lda  P8ESTACK_LO+1,x
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; A / W1+1 = W*2
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_W1+1		; SCRATCH_W1 = W*3
		lda  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; A / W1+1 = W*24
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ESTACK_HI+1,x
		sta  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_byte_40	.proc
		; (X*4 + X)*8
		; -- multiply the byte at P8ESTACK_LO+1,x by 40, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		;    Done with shifts and one add so the result is right for every operand;
		;    an 8-entry table lookup is only correct for operands 0..7
		;    (8*40 = 320, which is 64 modulo 256) and would also clobber Y.
		lda  P8ESTACK_LO+1,x
		asl  a
		asl  a
		clc
		adc  P8ESTACK_LO+1,x		; A = X*5
		asl  a
		asl  a
		asl  a
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_40	.proc
		; (W*4 + W)*8
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 40, in place
		;    (result modulo 65536). clobbers A and P8ZP_SCRATCH_REG.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_REG
		lda  P8ESTACK_LO+1,x
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG		; A / SCRATCH_REG = W*4
		clc
		adc  P8ESTACK_LO+1,x
		sta  P8ESTACK_LO+1,x		; lsb of W*5
		lda  P8ZP_SCRATCH_REG
		adc  P8ESTACK_HI+1,x		; A = msb of W*5
		asl  P8ESTACK_LO+1,x		; three doublings, lsb in memory and msb in A
                rol  a
		asl  P8ESTACK_LO+1,x
                rol  a
		asl  P8ESTACK_LO+1,x
                rol  a
		sta  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_byte_50	.proc
		; ((X*2 + X)*8 + X)*2
		; -- multiply the byte at P8ESTACK_LO+1,x by 50, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		;    Done with shifts and adds so the result is right for every operand;
		;    an 8-entry table lookup is only correct for operands 0..7
		;    (8*50 = 400, which is 144 modulo 256) and would also clobber Y.
		lda  P8ESTACK_LO+1,x
		asl  a
		clc
		adc  P8ESTACK_LO+1,x		; A = X*3
		asl  a
		asl  a
		asl  a
		clc
		adc  P8ESTACK_LO+1,x		; A = X*25
		asl  a
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_50	.proc
		; W = W * 25 * 2
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 50, in place
		;    (result modulo 65536). stack_mul_word_25 does the main work (it clobbers
		;    A and P8ZP_SCRATCH_W1); the product is then doubled in place.
		jsr  stack_mul_word_25
		asl  P8ESTACK_LO+1,x
		rol  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_byte_80	.proc
		; (X*4 + X)*16
		; -- multiply the byte at P8ESTACK_LO+1,x by 80, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		;    Done with shifts and one add so the result is right for every operand;
		;    a 4-entry table lookup is only correct for operands 0..3
		;    (4*80 = 320, which is 64 modulo 256) and would also clobber Y.
		lda  P8ESTACK_LO+1,x
		asl  a
		asl  a
		clc
		adc  P8ESTACK_LO+1,x		; A = X*5
		asl  a
		asl  a
		asl  a
		asl  a
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_80	.proc
		; W = W * 40 * 2
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 80, in place
		;    (result modulo 65536). stack_mul_word_40 does the main work (it clobbers
		;    A and P8ZP_SCRATCH_REG); the product is then doubled in place.
		jsr  stack_mul_word_40
		asl  P8ESTACK_LO+1,x
		rol  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_byte_100	.proc
		; ((X*2 + X)*8 + X)*4
		; -- multiply the byte at P8ESTACK_LO+1,x by 100, in place (result modulo 256).
		;    'X' in the formula is that byte value, not the X register. clobbers A.
		;    Done with shifts and adds so the result is right for every operand;
		;    a 4-entry table lookup is only correct for operands 0..3
		;    (4*100 = 400, which is 144 modulo 256) and would also clobber Y.
		lda  P8ESTACK_LO+1,x
		asl  a
		clc
		adc  P8ESTACK_LO+1,x		; A = X*3
		asl  a
		asl  a
		asl  a
		clc
		adc  P8ESTACK_LO+1,x		; A = X*25
		asl  a
		asl  a
		sta  P8ESTACK_LO+1,x
		rts
		.pend

stack_mul_word_100	.proc
		; W = W * 25 * 4
		; -- multiply the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x by 100, in place
		;    (result modulo 65536). stack_mul_word_25 does the main work (it clobbers
		;    A and P8ZP_SCRATCH_W1); the product is then doubled twice in place.
		jsr  stack_mul_word_25
		asl  P8ESTACK_LO+1,x
		rol  P8ESTACK_HI+1,x
		asl  P8ESTACK_LO+1,x
		rol  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_word_320	.proc
		; stackW = stackLo * 256 + stackLo * 64	 (stackHi doesn't matter)
		; -- multiply the BYTE at P8ESTACK_LO+1,x by 320, word result stored in
		;    P8ESTACK_LO+1,x / P8ESTACK_HI+1,x. The incoming high byte is replaced
		;    by 0 before shifting, so it never takes part in the calculation.
		;    clobbers A and Y.
		ldy  P8ESTACK_LO+1,x		; keep the operand for the *256 term
		lda  #0
		sta  P8ESTACK_HI+1,x
		tya
		asl  a
		rol  P8ESTACK_HI+1,x
		asl  a
		rol  P8ESTACK_HI+1,x
		asl  a
		rol  P8ESTACK_HI+1,x
		asl  a
		rol  P8ESTACK_HI+1,x
		asl  a
		rol  P8ESTACK_HI+1,x
		asl  a
		rol  P8ESTACK_HI+1,x		; A / stackHi = operand * 64
		sta  P8ESTACK_LO+1,x
		tya
		clc
		adc  P8ESTACK_HI+1,x		; adding the operand to the msb is the *256 term
		sta  P8ESTACK_HI+1,x
		rts
		.pend

stack_mul_word_640	.proc
		; stackW = (stackLo * 2 * 320)    (stackHi doesn't matter)
		; -- doubles the byte at P8ESTACK_LO+1,x in place and continues in stack_mul_word_320.
		;    NOTE: the bit shifted out of the low byte by the asl is not carried
		;    anywhere, so operands of 128 and above lose their top bit.
		asl  P8ESTACK_LO+1,x
		jmp  stack_mul_word_320
		.pend


; ----------- optimized multiplications (in-place: A for bytes, A/Y for words) : ---------
mul_byte_3	.proc
		; A = A + A*2
		; -- multiply A by 3 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG
		rts
		.pend

mul_word_3	.proc
		; AY = AY*2 + AY
		; -- multiply the word in A (lsb) / Y (msb) by 3 (result modulo 65536).
		;    uses P8ZP_SCRATCH_W1 (work copy) and P8ZP_SCRATCH_W2 (original value); X is not touched.
		sta  P8ZP_SCRATCH_W1
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; A / W1+1 = AY*2
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend


mul_byte_5	.proc
		; A = A*4 + A
		; -- multiply A by 5 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG
		rts
		.pend

mul_word_5	.proc
		; AY = AY*4 + AY
		; -- multiply the word in A (lsb) / Y (msb) by 5 (result modulo 65536).
		;    uses P8ZP_SCRATCH_W1 (work copy) and P8ZP_SCRATCH_W2 (original value); X is not touched.
		sta  P8ZP_SCRATCH_W1
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; A / W1+1 = AY*4
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend


mul_byte_6	.proc
		; A = (A*2 + A)*2
		; -- multiply A by 6 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
                clc
                adc  P8ZP_SCRATCH_REG
		asl  a
		rts
		.pend

mul_word_6	.proc
		; AY = (AY*2 + AY)*2
		; -- multiply the word in A (lsb) / Y (msb) by 6 (result modulo 65536).
		;    uses P8ZP_SCRATCH_W1 (work copy) and P8ZP_SCRATCH_W2 (original value); X is not touched.
		sta  P8ZP_SCRATCH_W1
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		tay				; park lsb of AY*3 (tay leaves the carry alone)
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W1+1		; msb of AY*3
		tya
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; final doubling
		ldy  P8ZP_SCRATCH_W1+1
		rts
		.pend

mul_byte_7	.proc
		; A = A*8 - A
		; -- multiply A by 7 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		asl  a
		sec
		sbc  P8ZP_SCRATCH_REG
		rts
		.pend

mul_word_7	.proc
		; AY = AY*8 - AY
		; -- multiply the word in A (lsb) / Y (msb) by 7 (result modulo 65536).
		;    uses P8ZP_SCRATCH_W1 (work copy) and P8ZP_SCRATCH_W2 (original value); X is not touched.
		sta  P8ZP_SCRATCH_W1
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; A / W1+1 = AY*8
		sec
		sbc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		sbc  P8ZP_SCRATCH_W2+1
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend

mul_byte_9	.proc
		; A = A*8 + A
		; -- multiply A by 9 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG
		rts
		.pend

mul_word_9	.proc
		; AY = AY*8 + AY
		; -- multiply the word in A (lsb) / Y (msb) by 9 (result modulo 65536).
		;    uses P8ZP_SCRATCH_W1 (work copy) and P8ZP_SCRATCH_W2 (original value); X is not touched.
		sta  P8ZP_SCRATCH_W1
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; A / W1+1 = AY*8
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend

mul_byte_10	.proc
		; A=(A*4 + A)*2
		; -- multiply A by 10 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG
		asl  a
		rts
		.pend

mul_word_10	.proc
		; AY=(AY*4 + AY)*2
		; -- multiply the word in A (lsb) / Y (msb) by 10 (result modulo 65536).
		;    uses P8ZP_SCRATCH_W1 (work copy) and P8ZP_SCRATCH_W2 (original value); X is not touched.
		sta  P8ZP_SCRATCH_W1
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; A / W1+1 = AY*4
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W1+1		; SCRATCH_W1 = AY*5
		lda  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; final doubling
		ldy  P8ZP_SCRATCH_W1+1
		rts
		.pend

mul_byte_11	.proc
		; A=(A*2 + A)*4 - A
		; -- multiply A by 11 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		sec
		sbc  P8ZP_SCRATCH_REG
		rts
		.pend

; mul_word_11 is skipped (too much code)

mul_byte_12	.proc
		; A=(A*2 + A)*4
		; -- multiply A by 12 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		rts
		.pend

mul_word_12	.proc
		; AY=(AY*2 + AY)*4
		; -- multiply the word in A (lsb) / Y (msb) by 12 (result modulo 65536).
		;    uses P8ZP_SCRATCH_W1 (work copy) and P8ZP_SCRATCH_W2 (original value); X is not touched.
		sta  P8ZP_SCRATCH_W1
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W1+1		; SCRATCH_W1 = AY*3
		lda  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; two doublings
		ldy  P8ZP_SCRATCH_W1+1
		rts
		.pend

mul_byte_13	.proc
		; A=(A*2 + A)*4 + A
		; -- multiply A by 13 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
                clc
		adc  P8ZP_SCRATCH_REG
		asl  a
		asl  a
                clc
		adc  P8ZP_SCRATCH_REG
		rts
		.pend

; mul_word_13 is skipped (too much code)

mul_byte_14	.proc
		; A=(A*8 - A)*2
		; -- multiply A by 14 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		asl  a
                sec
		sbc  P8ZP_SCRATCH_REG
                asl  a
		rts
		.pend

; mul_word_14 is skipped (too much code)

mul_byte_15	.proc
		; A=A*16 - A
		; -- multiply A by 15 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		asl  a
		asl  a
		sec
		sbc  P8ZP_SCRATCH_REG
		rts
		.pend

mul_word_15	.proc
		; AY = AY * 16 - AY
		; -- multiply the word in A (lsb) / Y (msb) by 15 (result modulo 65536).
		;    uses P8ZP_SCRATCH_W1 (work copy) and P8ZP_SCRATCH_W2 (original value); X is not touched.
		sta  P8ZP_SCRATCH_W1
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; A / W1+1 = AY*16
		sec
		sbc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		sbc  P8ZP_SCRATCH_W2+1
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend

mul_byte_20	.proc
		; A=(A*4 + A)*4
		; -- multiply A by 20 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		rts
		.pend

mul_word_20	.proc
		; AY = AY * 10 * 2
		; -- multiply the word in A (lsb) / Y (msb) by 20 (result modulo 65536).
		;    mul_word_10 does the main work (it uses P8ZP_SCRATCH_W1 and P8ZP_SCRATCH_W2);
		;    the doubling afterwards goes through P8ZP_SCRATCH_REG. X is not touched.
		jsr  mul_word_10
		sty  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		ldy  P8ZP_SCRATCH_REG
		rts
		.pend

mul_byte_25	.proc
		; A=(A*2 + A)*8 + A
		; -- multiply A by 25 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG
		rts
		.pend

mul_word_25	.proc
		; AY = (AY*2 + AY) *8 + AY
		; -- multiply the word in A (lsb) / Y (msb) by 25 (result modulo 65536).
		;    uses P8ZP_SCRATCH_W1 (work copy) and P8ZP_SCRATCH_W2 (original value); X is not touched.
		sta  P8ZP_SCRATCH_W1
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W1+1		; SCRATCH_W1 = AY*3
		lda  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; A / W1+1 = AY*24
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend

mul_byte_40	.proc
		; A = (A*4 + A)*8
		; -- multiply A by 40 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		;    Done with shifts and one add so the result is right for every value of A;
		;    the earlier 'and #7' lookup in _forties was only correct for A = 0..7
		;    (8*40 = 320, which is 64 modulo 256, not 0) and clobbered Y.
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A = A*5
		asl  a
		asl  a
		asl  a
		rts
		; table is no longer used here, but stays because it can be addressed
		; from outside this proc as mul_byte_40._forties
_forties	.byte  0*40, 1*40, 2*40, 3*40, 4*40, 5*40, 6*40, 7*40 & 255
		.pend

mul_word_40	.proc
		; AY = (AY*4 + AY)*8
		; -- multiply the word in A (lsb) / Y (msb) by 40 (result modulo 65536).
		;    uses P8ZP_SCRATCH_W1 (work copy) and P8ZP_SCRATCH_W2 (original value); X is not touched.
		sta  P8ZP_SCRATCH_W1
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1		; A / W1+1 = AY*4
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1		; lsb of AY*5
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1		; A = msb of AY*5
		asl  P8ZP_SCRATCH_W1		; three doublings, lsb in memory and msb in A
		rol  a
		asl  P8ZP_SCRATCH_W1
		rol  a
		asl  P8ZP_SCRATCH_W1
		rol  a
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend

mul_byte_50	.proc
		; A = ((A*2 + A)*8 + A)*2
		; -- multiply A by 50 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		;    Done with shifts and adds so the result is right for every value of A;
		;    the earlier 'and #7' lookup in _fifties was only correct for A = 0..7
		;    (8*50 = 400, which is 144 modulo 256, not 0) and clobbered Y.
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A = A*3
		asl  a
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A = A*25
		asl  a
		rts
		; table is no longer used here, but stays because it can be addressed
		; from outside this proc as mul_byte_50._fifties
_fifties	.byte  0*50, 1*50, 2*50, 3*50, 4*50, 5*50, 6*50 & 255, 7*50 & 255
		.pend

mul_word_50	.proc
		; AY = AY * 25 * 2
		; -- multiply the word in A (lsb) / Y (msb) by 50 (result modulo 65536).
		;    mul_word_25 does the main work (it uses P8ZP_SCRATCH_W1 and P8ZP_SCRATCH_W2);
		;    the doubling afterwards goes through P8ZP_SCRATCH_REG. X is not touched.
		jsr  mul_word_25
		sty  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		ldy  P8ZP_SCRATCH_REG
		rts
		.pend

mul_byte_80	.proc
		; A = (A*4 + A)*16
		; -- multiply A by 80 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		;    Done with shifts and one add so the result is right for every value of A;
		;    the earlier 'and #3' lookup in _eighties was only correct for A = 0..3
		;    (4*80 = 320, which is 64 modulo 256, not 0) and clobbered Y.
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A = A*5
		asl  a
		asl  a
		asl  a
		asl  a
		rts
		; table is no longer used here, but stays because it can be addressed
		; from outside this proc as mul_byte_80._eighties
_eighties	.byte  0*80, 1*80, 2*80, 3*80
		.pend

mul_word_80	.proc
		; AY = AY * 40 * 2
		; -- multiply the word in A (lsb) / Y (msb) by 80 (result modulo 65536).
		;    mul_word_40 does the main work (it uses P8ZP_SCRATCH_W1 and P8ZP_SCRATCH_W2);
		;    the doubling afterwards goes through P8ZP_SCRATCH_REG. X is not touched.
		jsr  mul_word_40
		sty  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		ldy  P8ZP_SCRATCH_REG
		rts
		.pend

mul_byte_100	.proc
		; A = ((A*2 + A)*8 + A)*4
		; -- multiply A by 100 (result modulo 256). uses P8ZP_SCRATCH_REG; X and Y are not touched.
		;    Done with shifts and adds so the result is right for every value of A;
		;    the earlier 'and #3' lookup in _hundreds was only correct for A = 0..3
		;    (4*100 = 400, which is 144 modulo 256, not 0) and clobbered Y.
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A = A*3
		asl  a
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A = A*25
		asl  a
		asl  a
		rts
		; table is no longer used here, but stays because it can be addressed
		; from outside this proc as mul_byte_100._hundreds
_hundreds	.byte  0*100, 1*100, 2*100, 3*100 & 255
		.pend

mul_word_100	.proc
		; AY = AY * 25 * 4
		; -- multiply the word in A (lsb) / Y (msb) by 100 (result modulo 65536).
		;    mul_word_25 does the main work (it uses P8ZP_SCRATCH_W1 and P8ZP_SCRATCH_W2);
		;    the two doublings afterwards go through P8ZP_SCRATCH_REG. X is not touched.
		jsr  mul_word_25
		sty  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		ldy  P8ZP_SCRATCH_REG
		rts
		.pend

mul_word_320	.proc
		; AY = A * 256 + A * 64	 (msb in Y doesn't matter)
		; -- multiply the BYTE in A by 320, word result in A (lsb) / Y (msb).
		;    The incoming Y is overwritten with 0 right away, so it never takes part.
		;    uses P8ZP_SCRATCH_B1 (original A), P8ZP_SCRATCH_REG (msb under construction)
		;    and one byte of hardware stack; X is not touched.
		sta  P8ZP_SCRATCH_B1
		ldy  #0
		sty  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG		; A / SCRATCH_REG = A*64
		pha				; park the lsb
		clc
		lda  P8ZP_SCRATCH_B1
		adc  P8ZP_SCRATCH_REG		; adding the original A to the msb is the *256 term
		tay
		pla
		rts
		.pend

mul_word_640	.proc
		; AY = (A * 2 * 320) (msb in Y doesn't matter)
		; -- doubles A and continues in mul_word_320.
		;    NOTE: the bit shifted out of A by the asl is not carried anywhere,
		;    so values of 128 and above lose their top bit.
		asl  a
		jmp  mul_word_320
		.pend


; ----------- end optimized multiplications -----------


; bit shifts.
; anything below 3 is done inline. anything above 7 is done via other optimizations.

shift_left_w_7	.proc
		; -- shift the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x left by 7 bits, in place.
		;    The unrolled chain below also serves shift_left_w_6 .. shift_left_w_3, which
		;    load A and P8ZP_SCRATCH_B1 the same way and jump in at _shift6 .. _shift3.
		;    clobbers A and P8ZP_SCRATCH_B1.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_B1		; msb is shifted in scratch memory
		lda  P8ESTACK_LO+1,x		; lsb is shifted in A

		asl  a
		rol  P8ZP_SCRATCH_B1
_shift6		asl  a
		rol  P8ZP_SCRATCH_B1
_shift5		asl  a
		rol  P8ZP_SCRATCH_B1
_shift4		asl  a
		rol  P8ZP_SCRATCH_B1
_shift3		asl  a
		rol  P8ZP_SCRATCH_B1
		asl  a
		rol  P8ZP_SCRATCH_B1
		asl  a
		rol  P8ZP_SCRATCH_B1

		sta  P8ESTACK_LO+1,x
		lda  P8ZP_SCRATCH_B1
		sta  P8ESTACK_HI+1,x
		rts
		.pend

shift_left_w_6	.proc
		; -- shift the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x left by 6 bits, in place.
		;    Sets up A (lsb) and P8ZP_SCRATCH_B1 (msb), then joins the unrolled chain of
		;    shift_left_w_7 where 6 shift steps remain; that code also stores the result.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_B1
		lda  P8ESTACK_LO+1,x
		jmp  shift_left_w_7._shift6
		.pend

shift_left_w_5	.proc
		; -- shift the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x left by 5 bits, in place.
		;    Sets up A (lsb) and P8ZP_SCRATCH_B1 (msb), then joins the unrolled chain of
		;    shift_left_w_7 where 5 shift steps remain; that code also stores the result.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_B1
		lda  P8ESTACK_LO+1,x
		jmp  shift_left_w_7._shift5
		.pend

shift_left_w_4	.proc
		; -- shift the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x left by 4 bits, in place.
		;    Sets up A (lsb) and P8ZP_SCRATCH_B1 (msb), then joins the unrolled chain of
		;    shift_left_w_7 where 4 shift steps remain; that code also stores the result.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_B1
		lda  P8ESTACK_LO+1,x
		jmp  shift_left_w_7._shift4
		.pend

shift_left_w_3	.proc
		; -- shift the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x left by 3 bits, in place.
		;    Sets up A (lsb) and P8ZP_SCRATCH_B1 (msb), then joins the unrolled chain of
		;    shift_left_w_7 where 3 shift steps remain; that code also stores the result.
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_B1
		lda  P8ESTACK_LO+1,x
		jmp  shift_left_w_7._shift3
		.pend


shift_left_w	.proc
		; -- shift a word left by a variable number of bits.
		;    X is incremented first; the count is then read from P8ESTACK_LO,x and the
		;    word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x is shifted in place.
		;    A count of 0 leaves the word unchanged. clobbers Y.
		inx
		ldy  P8ESTACK_LO,x		; shift count
		beq  _done
_shift		asl  P8ESTACK_LO+1,x
		rol  P8ESTACK_HI+1,x
		dey
		bne  _shift
_done		rts
		.pend

shift_right_uw	.proc
		; -- logical (unsigned) shift right of a uword by a variable number of bits.
		;    X is incremented first; the count is then read from P8ESTACK_LO,x and the
		;    word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x is shifted in place.
		;    A count of 0 leaves the word unchanged. clobbers Y.
		inx
		ldy  P8ESTACK_LO,x		; shift count
		beq  _done
_shift		lsr  P8ESTACK_HI+1,x		; zero enters at the top
		ror  P8ESTACK_LO+1,x
		dey
		bne  _shift
_done		rts
		.pend

shift_right_uw_7	.proc
		; -- logical shift right by 7 bits of the uword at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x, in place.
		;    The unrolled chain below also serves shift_right_uw_6 .. shift_right_uw_3, which
		;    load A and P8ZP_SCRATCH_B1 the same way and jump in at _shift6 .. _shift3.
		;    clobbers A and P8ZP_SCRATCH_B1.
		lda  P8ESTACK_LO+1,x
		sta  P8ZP_SCRATCH_B1		; lsb is shifted in scratch memory
		lda  P8ESTACK_HI+1,x		; msb is shifted in A

		lsr  a
		ror  P8ZP_SCRATCH_B1
_shift6		lsr  a
		ror  P8ZP_SCRATCH_B1
_shift5		lsr  a
		ror  P8ZP_SCRATCH_B1
_shift4		lsr  a
		ror  P8ZP_SCRATCH_B1
_shift3		lsr  a
		ror  P8ZP_SCRATCH_B1
		lsr  a
		ror  P8ZP_SCRATCH_B1
		lsr  a
		ror  P8ZP_SCRATCH_B1

		sta  P8ESTACK_HI+1,x
		lda  P8ZP_SCRATCH_B1
		sta  P8ESTACK_LO+1,x
		rts
		.pend

shift_right_uw_6	.proc
		; -- logical shift right by 6 bits of the uword at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x, in place.
		;    Sets up A (msb) and P8ZP_SCRATCH_B1 (lsb), then joins the unrolled chain of
		;    shift_right_uw_7 where 6 shift steps remain; that code also stores the result.
		lda  P8ESTACK_LO+1,x
		sta  P8ZP_SCRATCH_B1
		lda  P8ESTACK_HI+1,x
		jmp  shift_right_uw_7._shift6
		.pend

shift_right_uw_5	.proc
		; -- logical shift right by 5 bits of the uword at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x, in place.
		;    Sets up A (msb) and P8ZP_SCRATCH_B1 (lsb), then joins the unrolled chain of
		;    shift_right_uw_7 where 5 shift steps remain; that code also stores the result.
		lda  P8ESTACK_LO+1,x
		sta  P8ZP_SCRATCH_B1
		lda  P8ESTACK_HI+1,x
		jmp  shift_right_uw_7._shift5
		.pend

shift_right_uw_4	.proc
		; -- logical shift right by 4 bits of the uword at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x, in place.
		;    Sets up A (msb) and P8ZP_SCRATCH_B1 (lsb), then joins the unrolled chain of
		;    shift_right_uw_7 where 4 shift steps remain; that code also stores the result.
		lda  P8ESTACK_LO+1,x
		sta  P8ZP_SCRATCH_B1
		lda  P8ESTACK_HI+1,x
		jmp  shift_right_uw_7._shift4
		.pend

shift_right_uw_3	.proc
		; -- logical shift right by 3 bits of the uword at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x, in place.
		;    Sets up A (msb) and P8ZP_SCRATCH_B1 (lsb), then joins the unrolled chain of
		;    shift_right_uw_7 where 3 shift steps remain; that code also stores the result.
		lda  P8ESTACK_LO+1,x
		sta  P8ZP_SCRATCH_B1
		lda  P8ESTACK_HI+1,x
		jmp  shift_right_uw_7._shift3
		.pend


shift_right_w_7		.proc
		; -- sign-preserving shift right by 7 bits of the word at
		;    P8ESTACK_LO+1,x / P8ESTACK_HI+1,x, in place.
		;    Every step copies the msb into A and does 'asl a' to get the sign bit into
		;    the carry, which the following ror then feeds back in at the top.
		;    The chain also serves shift_right_w_6 .. shift_right_w_3, which enter at
		;    _shift6 .. _shift3 with A holding the msb and P8ZP_SCRATCH_W1 holding the word.
		;    clobbers A and P8ZP_SCRATCH_W1.
		lda  P8ESTACK_LO+1,x
		sta  P8ZP_SCRATCH_W1
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_W1+1

		asl  a				; sign bit -> carry
		ror  P8ZP_SCRATCH_W1+1
		ror  P8ZP_SCRATCH_W1

		lda  P8ZP_SCRATCH_W1+1
_shift6		asl  a
		ror  P8ZP_SCRATCH_W1+1
		ror  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
_shift5		asl  a
		ror  P8ZP_SCRATCH_W1+1
		ror  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
_shift4		asl  a
		ror  P8ZP_SCRATCH_W1+1
		ror  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
_shift3		asl  a
		ror  P8ZP_SCRATCH_W1+1
		ror  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		asl  a
		ror  P8ZP_SCRATCH_W1+1
		ror  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		asl  a
		ror  P8ZP_SCRATCH_W1+1
		ror  P8ZP_SCRATCH_W1

		lda  P8ZP_SCRATCH_W1
		sta  P8ESTACK_LO+1,x
		lda  P8ZP_SCRATCH_W1+1
		sta  P8ESTACK_HI+1,x
		rts
		.pend

shift_right_w_6	.proc
		; -- sign-preserving shift right by 6 bits of the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x.
		;    Copies the word to P8ZP_SCRATCH_W1 (A ends up holding the msb), then joins the
		;    chain of shift_right_w_7 where 6 steps remain; that code also stores the result.
		lda  P8ESTACK_LO+1,x
		sta  P8ZP_SCRATCH_W1
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_W1+1
		jmp  shift_right_w_7._shift6
		.pend

shift_right_w_5	.proc
		; -- sign-preserving shift right by 5 bits of the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x.
		;    Copies the word to P8ZP_SCRATCH_W1 (A ends up holding the msb), then joins the
		;    chain of shift_right_w_7 where 5 steps remain; that code also stores the result.
		lda  P8ESTACK_LO+1,x
		sta  P8ZP_SCRATCH_W1
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_W1+1
		jmp  shift_right_w_7._shift5
		.pend

shift_right_w_4	.proc
		; -- sign-preserving shift right by 4 bits of the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x.
		;    Copies the word to P8ZP_SCRATCH_W1 (A ends up holding the msb), then joins the
		;    chain of shift_right_w_7 where 4 steps remain; that code also stores the result.
		lda  P8ESTACK_LO+1,x
		sta  P8ZP_SCRATCH_W1
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_W1+1
		jmp  shift_right_w_7._shift4
		.pend

shift_right_w_3	.proc
		; -- sign-preserving shift right by 3 bits of the word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x.
		;    Copies the word to P8ZP_SCRATCH_W1 (A ends up holding the msb), then joins the
		;    chain of shift_right_w_7 where 3 steps remain; that code also stores the result.
		lda  P8ESTACK_LO+1,x
		sta  P8ZP_SCRATCH_W1
		lda  P8ESTACK_HI+1,x
		sta  P8ZP_SCRATCH_W1+1
		jmp  shift_right_w_7._shift3
		.pend


shift_right_w	.proc
		; -- sign-preserving shift right of a signed word by a variable number of bits.
		;    X is incremented first; the count is then read from P8ESTACK_LO,x and the
		;    word at P8ESTACK_LO+1,x / P8ESTACK_HI+1,x is shifted in place.
		;    A count of 0 leaves the word unchanged. clobbers A (when count > 0) and Y.
		inx
		ldy  P8ESTACK_LO,x		; shift count
		beq  _done
_shift		lda  P8ESTACK_HI+1,x
		asl  a				; sign bit -> carry
		ror  P8ESTACK_HI+1,x		; and back in at the top
		ror  P8ESTACK_LO+1,x
		dey
		bne  _shift
_done		rts
		.pend


; support for bit shifting that is too large to be unrolled:

lsr_byte_A	.proc
		; -- shift the signed byte in A right by Y bits, keeping its sign.
		;    Y must be > 0: the count is decremented before it is tested. clobbers Y.
		cmp  #0
		bmi  _negative
_positive	lsr  a
		dey
		bne  _positive
		rts
_negative	lsr  a
		ora  #$80		; put the sign bit back after every step
		dey
		bne  _negative
		rts
		.pend


square          .proc
; -- calculate the square (NOT the square root) of the signed word in AY, result in AY
; routine by Lee Davison, source: http://6502.org/source/integers/square.htm
; using this routine is about twice as fast as doing a regular multiplication.
;
; Calculates the 16 bit unsigned integer square of the signed 16 bit integer in
; Numberl/Numberh.  The result is always in the range 0 to 65025 and is held in
; Squarel/Squareh
;
; The maximum input range is only +/-255 and no checking is done to ensure that
; this is so.
; (Only the low byte takes part in the calculation; the high byte is merely
; tested for its sign.)
;
; This routine is useful if you are trying to draw circles as for any circle
;
; x^2+y^2=r^2 where x and y are the co-ordinates of any point on the circle and
; r is the circle radius
;
; X is saved in P8ZP_SCRATCH_REG and restored before returning.

numberl = P8ZP_SCRATCH_W1       ; number to square low byte
numberh = P8ZP_SCRATCH_W1+1     ; number to square high byte
squarel = P8ZP_SCRATCH_W2       ; square low byte
squareh = P8ZP_SCRATCH_W2+1     ; square high byte
tempsq = P8ZP_SCRATCH_B1        ; temp byte for intermediate result

	sta  numberl
	sty  numberh
	stx  P8ZP_SCRATCH_REG

        lda     #$00        ; clear a
        sta     squarel     ; clear square low byte
                            ; (no need to clear the high byte, it gets shifted out)
        lda	numberl     ; get number low byte
	ldx	numberh     ; get number high  byte (only to set the N flag)
	bpl	_nonneg      ; if +ve don't negate it
                            ; else do a two's complement
	eor	#$ff        ; invert
        sec	            ; +1
	adc	#$00        ; and add it

_nonneg:
	sta	tempsq      ; save abs(number)
	ldx	#$08        ; set bit count

_nextr2bit:
	asl	squarel     ; low byte *2
	rol	squareh     ; high byte *2+carry from low
	asl	a           ; shift number byte
	bcc	_nosqadd     ; don't do add if c = 0
	tay                 ; save a
	clc                 ; clear carry for add
	lda	tempsq      ; get number
	adc	squarel     ; add number^2 low byte
	sta	squarel     ; save number^2 low byte
	lda	#$00        ; clear a
	adc	squareh     ; add number^2 high byte
	sta	squareh     ; save number^2 high byte
	tya                 ; get a back

_nosqadd:
	dex                 ; decrement bit count
	bne	_nextr2bit   ; go do next bit

	lda  squarel
	ldy  squareh
	ldx  P8ZP_SCRATCH_REG
	rts

		.pend