; Internal Math library routines - always included by the compiler
; Generic machine independent 6502 code.
;
;  some more interesting routines can be found here:
;	http://6502org.wikidot.com/software-math
;	http://codebase64.org/doku.php?id=base:6502_6510_maths
;       https://github.com/TobyLobster/multiply_test
;       https://github.com/TobyLobster/sqrt_test


multiply_bytes	.proc
	; -- 8x8 bit multiply, keeping only the low byte: A = (A * Y) & 255
	;    Because only the low byte is kept, the same code serves signed and unsigned operands.
	;    in:  A = first factor, Y = second factor
	;    out: A = low 8 bits of the product
	;    uses P8ZP_SCRATCH_B1 (first factor, doubled every round) and
	;    P8ZP_SCRATCH_REG (second factor, consumed from bit 0 upwards)
		sty  P8ZP_SCRATCH_REG
		sta  P8ZP_SCRATCH_B1
		lda  #0                      ; running sum
		beq  _next_bit               ; always taken: lda #0 has set the Z flag
_accumulate	clc
		adc  P8ZP_SCRATCH_B1         ; current bit of the second factor was 1
_double		asl  P8ZP_SCRATCH_B1
_next_bit	lsr  P8ZP_SCRATCH_REG        ; next bit of the second factor -> carry
		bcs  _accumulate
		bne  _double                 ; continue while bits of the second factor remain
		rts
		.pend


multiply_bytes_into_word	.proc
	; -- unsigned 8x8 -> 16 bit multiply
	;    in:  A, Y = the two unsigned byte factors
	;    out: A = low byte of the product, Y = high byte
	;    clobbers X (bit counter), P8ZP_SCRATCH_B1 and P8ZP_SCRATCH_REG
		sty  P8ZP_SCRATCH_REG	; multiplicand
		sta  P8ZP_SCRATCH_B1	; multiplier; its bits are replaced by the product's low byte
		ldx  #8
		lda  #0			; high byte of the product is built up in A
		lsr  P8ZP_SCRATCH_B1	; first multiplier bit -> carry
_bitloop	bcc  _shift
		clc
		adc  P8ZP_SCRATCH_REG
_shift		ror  a			; carry of the add becomes bit 7 of the high byte
		ror  P8ZP_SCRATCH_B1	; product bit in at the top, next multiplier bit out
		dex
		bne  _bitloop
		tay
		lda  P8ZP_SCRATCH_B1
		rts
		.pend


multiply_words	.proc
	; -- 16x16 -> 32 bit multiply
	;    in:  A/Y = first factor (A = low byte), P8ZP_SCRATCH_W1 = second factor
	;    out: multiply_words.result = 32-bit product, least significant byte first;
	;         A/Y = low 16 bits of that product (usable for signed and unsigned factors)
	;    clobbers X (bit counter), P8ZP_SCRATCH_W1 (shifted out completely) and P8ZP_SCRATCH_W2

		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2		; W2 = multiplicand

mult16		lda  #0
		sta  result+3
		sta  result+2		; upper half of the product starts at zero
		ldx  #16			; one round per multiplier bit
_bitloop	lsr  P8ZP_SCRATCH_W1+1	; lowest multiplier bit -> carry
		ror  P8ZP_SCRATCH_W1
		bcc  _rotate			; bit clear: nothing to add, A still equals result+3
		lda  result+2
		clc
		adc  P8ZP_SCRATCH_W2
		sta  result+2
		lda  result+3
		adc  P8ZP_SCRATCH_W2+1
_rotate		ror  a				; shift the partial product right, carry enters at the top
		sta  result+3
		ror  result+2
		ror  result+1
		ror  result
		dex
		bne  _bitloop
		lda  result
		ldy  result+1
		rts

result		.byte  0,0,0,0
		.pend


divmod_b_asm	.proc
	; -- signed byte division, built on top of divmod_ub_asm
	;    in:  A = dividend, Y = divisor (both signed bytes)
	;    out: Y = signed quotient
	;         A = remainder of dividing the absolute values when both operands have the
	;             same sign; otherwise A holds a copy of the quotient
	;    Both operands are made positive first; the sign of the quotient is fixed afterwards.
		sta  P8ZP_SCRATCH_B1
		tya
		eor  P8ZP_SCRATCH_B1	; N flag = sign of the quotient
		php			; keep it across the unsigned division
		lda  P8ZP_SCRATCH_B1
		bpl  _dividend_ok
		eor  #$ff
		sec
		adc  #0			; A = -A
_dividend_ok	pha
		tya
		bpl  _divisor_ok
		eor  #$ff
		sec
		adc  #0
		tay			; Y = -Y
_divisor_ok	pla
		jsr  divmod_ub_asm	; Y = quotient, A = remainder
		sta  _remainder
		plp
		bpl  _done		; operand signs were equal: quotient stays as it is
		tya
		eor  #$ff
		sec
		adc  #0
		tay			; Y = -quotient
_done		rts
_remainder	.byte  0
		.pend


divmod_ub_asm	.proc
	; -- unsigned 8 bit division: A / Y
	;    in:  A = dividend, Y = divisor
	;    out: Y = quotient, A = remainder
	;    Dividing by zero yields quotient 255 and leaves the dividend as the remainder.
	;    clobbers X (bit counter), P8ZP_SCRATCH_B1 and P8ZP_SCRATCH_REG
		sta  P8ZP_SCRATCH_B1	; dividend; quotient bits are rotated in from the right
		sty  P8ZP_SCRATCH_REG	; divisor

		ldx  #8
		lda  #0			; remainder is built up in A
		asl  P8ZP_SCRATCH_B1	; top dividend bit -> carry
_bitloop	rol  a
		cmp  P8ZP_SCRATCH_REG
		bcc  _too_small
		sbc  P8ZP_SCRATCH_REG	; carry is set here, so this is a plain subtract
_too_small	rol  P8ZP_SCRATCH_B1	; quotient bit in, next dividend bit out
		dex
		bne  _bitloop
		ldy  P8ZP_SCRATCH_B1
		rts
		.pend

divmod_w_asm	.proc
	; -- signed word division, built on top of divmod_uw_asm
	;    in:  P8ZP_SCRATCH_W1 = dividend, A/Y = divisor (A = low byte)
	;    out: A/Y = signed quotient
	;    Both operands are made positive first; the sign of the quotient is fixed afterwards.
	;    P8ZP_SCRATCH_W2 is workspace: when both operands have the same sign it is left as
	;    divmod_uw_asm returned it, otherwise it ends up holding the absolute quotient.
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2
		lda  P8ZP_SCRATCH_W1+1
		eor  P8ZP_SCRATCH_W2+1	; N flag = sign of the quotient
		php			; keep it across the unsigned division
		lda  P8ZP_SCRATCH_W1+1
		bpl  _dividend_ok
		lda  #0			; dividend = 0 - dividend
		sec
		sbc  P8ZP_SCRATCH_W1
		sta  P8ZP_SCRATCH_W1
		lda  #0
		sbc  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W1+1
_dividend_ok	lda  P8ZP_SCRATCH_W2+1
		bpl  _divisor_ok
		lda  #0			; divisor = 0 - divisor
		sec
		sbc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W2
		lda  #0
		sbc  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2+1
_divisor_ok	tay			; A holds the divisor's high byte on both paths
		lda  P8ZP_SCRATCH_W2
		jsr  divmod_uw_asm
		plp
		bpl  _done		; operand signs were equal: quotient stays as it is
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		lda  #0			; AY = 0 - quotient
		sec
		sbc  P8ZP_SCRATCH_W2
		pha
		lda  #0
		sbc  P8ZP_SCRATCH_W2+1
		tay
		pla
_done		rts
		.pend

divmod_uw_asm	.proc
	; -- unsigned 16 bit division
	;    in:  P8ZP_SCRATCH_W1 = dividend, A/Y = divisor (A = low byte)
	;    out: A/Y = quotient (also left in P8ZP_SCRATCH_W1), P8ZP_SCRATCH_W2 = remainder
	;    Dividing by zero yields quotient 65535 and leaves the dividend as the remainder.
	;    clobbers X (bit counter)


dividend = P8ZP_SCRATCH_W1
remainder = P8ZP_SCRATCH_W2
result = dividend		; the quotient is shifted into the bytes the dividend vacates

		sty  _divisor+1
		sta  _divisor
		lda  #0
		sta  remainder+1
		sta  remainder
		ldx  #16		; one round per dividend bit

_bitloop	asl  dividend		; top dividend bit -> carry, bit 0 of the quotient becomes 0
		rol  dividend+1
		rol  remainder		; ...and that dividend bit enters the remainder
		rol  remainder+1
		lda  remainder
		sec
		sbc  _divisor		; trial subtraction: remainder - divisor
		tay			; keep the low byte in case the subtraction is accepted
		lda  remainder+1
		sbc  _divisor+1
		bcc  _no_fit		; borrow: the divisor does not fit yet

		sta  remainder+1	; accept the subtraction as the new remainder
		sty  remainder
		inc  result		; set the quotient bit of this round

_no_fit		dex
		bne  _bitloop

		lda  result
		ldy  result+1
		rts
_divisor	.word 0
		.pend


randword	.proc
	; -- 16 bit pseudo random number generator into AY
	;    default seed = $00c2 $1137
	;    routine from https://codebase64.org/doku.php?id=base:x_abc_random_number_generator_8_16_bit
	;
	;    The generator state (x1, c1, a1, b1) is stored in the immediate operand bytes of
	;    the instructions below: every state label is defined as *+1, the address of the
	;    operand byte that follows the next opcode. The routine therefore modifies its own
	;    code on every call.
	;    Returns A = new value of c1, Y = new value of b1.
		inc x1		; bump the counter stored in the operand of the lda below
		clc		; the adc #b1 below runs without carry (lda/eor/sta leave carry alone)
x1=*+1
		lda #$00	;x1
c1=*+1
		eor #$c2	;c1
a1=*+1
		eor #$11	;a1
		sta a1		; a1 = x1 ^ c1 ^ a1
b1=*+1
		adc #$37	;b1
		sta b1		; b1 = a1 + b1
		lsr a		; bit 0 of the new b1 goes to carry and takes part in the adc below
		eor a1
		adc c1
		sta c1		; c1 = ((b1 >> 1) ^ a1) + c1 + carry
		ldy b1
		rts
		.pend

randbyte = randword    ; -- 8 bit pseudo random number generator into A (by just reusing randword)


; ----------- optimized multiplications (in-place A (byte) and AY (word)) : ---------
mul_byte_3	.proc
		; A = A*2 + A   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a				; A*2
		clc
		adc  P8ZP_SCRATCH_REG		; + A
		rts
		.pend

mul_word_3	.proc
		; AY = AY*2 + AY   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_W1 (working copy) and P8ZP_SCRATCH_W2 (original value)
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; A / W1+1 = value*2
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1		; low byte of the sum
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1	; high byte, with the carry of the low byte
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend


mul_byte_5	.proc
		; A = A*4 + A   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a				; A*4
		clc
		adc  P8ZP_SCRATCH_REG		; + A
		rts
		.pend

mul_word_5	.proc
		; AY = AY*4 + AY   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_W1 (working copy) and P8ZP_SCRATCH_W2 (original value)
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; A / W1+1 = value*4
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1		; low byte of the sum
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1	; high byte, with the carry of the low byte
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend


mul_byte_6	.proc
		; A = (A*2 + A)*2   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A*3
		asl  a				; *2
		rts
		.pend

mul_word_6	.proc
		; AY = (AY*2 + AY)*2   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_W1 (working copy) and P8ZP_SCRATCH_W2 (original value)
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; A / W1+1 = value*2
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		tay				; park the low byte of value*3 in Y
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W1+1	; high byte of value*3
		tya
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; *2
		ldy  P8ZP_SCRATCH_W1+1
		rts
		.pend

mul_byte_7	.proc
		; A = A*8 - A   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		asl  a				; A*8
		sec
		sbc  P8ZP_SCRATCH_REG		; - A
		rts
		.pend

mul_word_7	.proc
		; AY = AY*8 - AY   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_W1 (working copy) and P8ZP_SCRATCH_W2 (original value)
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; A / W1+1 = value*8
		sec
		sbc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1		; low byte of the difference
		lda  P8ZP_SCRATCH_W1+1
		sbc  P8ZP_SCRATCH_W2+1	; high byte, with the borrow of the low byte
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend

mul_byte_9	.proc
		; A = A*8 + A   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		asl  a				; A*8
		clc
		adc  P8ZP_SCRATCH_REG		; + A
		rts
		.pend

mul_word_9	.proc
		; AY = AY*8 + AY   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_W1 (working copy) and P8ZP_SCRATCH_W2 (original value)
		sta  P8ZP_SCRATCH_W1
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W2+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; A / W1+1 = value*8
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1		; low byte of the sum
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1	; high byte, with the carry of the low byte
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend

mul_byte_10	.proc
		; A = (A*4 + A)*2   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A*5
		asl  a				; *2
		rts
		.pend

mul_word_10	.proc
		; AY = (AY*4 + AY)*2   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_W1 (working copy) and P8ZP_SCRATCH_W2 (original value)
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; A / W1+1 = value*4
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W1+1	; W1 = value*5
		lda  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; *2
		ldy  P8ZP_SCRATCH_W1+1
		rts
		.pend

mul_byte_11	.proc
		; A = (A*2 + A)*4 - A   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A*3
		asl  a
		asl  a				; A*12
		sec
		sbc  P8ZP_SCRATCH_REG		; - A
		rts
		.pend

; mul_word_11 is skipped (too much code)

mul_byte_12	.proc
		; A = (A*2 + A)*4   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A*3
		asl  a
		asl  a				; *4
		rts
		.pend

mul_word_12	.proc
		; AY = (AY*2 + AY)*4   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_W1 (working copy) and P8ZP_SCRATCH_W2 (original value)
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; A / W1+1 = value*2
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W1+1	; W1 = value*3
		lda  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; *4
		ldy  P8ZP_SCRATCH_W1+1
		rts
		.pend

mul_byte_13	.proc
		; A = (A*2 + A)*4 + A   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A*3
		asl  a
		asl  a				; A*12
		clc
		adc  P8ZP_SCRATCH_REG		; + A
		rts
		.pend

; mul_word_13 is skipped (too much code)

mul_byte_14	.proc
		; A = (A*8 - A)*2   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		asl  a				; A*8
		sec
		sbc  P8ZP_SCRATCH_REG		; A*7
		asl  a				; *2
		rts
		.pend

; mul_word_14 is skipped (too much code)

mul_byte_15	.proc
		; A = A*16 - A   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		asl  a
		asl  a				; A*16
		sec
		sbc  P8ZP_SCRATCH_REG		; - A
		rts
		.pend

mul_word_15	.proc
		; AY = AY*16 - AY   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_W1 (working copy) and P8ZP_SCRATCH_W2 (original value)
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; A / W1+1 = value*16
		sec
		sbc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1		; low byte of the difference
		lda  P8ZP_SCRATCH_W1+1
		sbc  P8ZP_SCRATCH_W2+1	; high byte, with the borrow of the low byte
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend

mul_byte_20	.proc
		; A = (A*4 + A)*4   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A*5
		asl  a
		asl  a				; *4
		rts
		.pend

mul_word_20	.proc
		; AY = (AY * 10) * 2   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_REG in addition to whatever mul_word_10 uses
		jsr  mul_word_10
		sty  P8ZP_SCRATCH_REG		; high byte has to go through memory to be rotated
		asl  a
		rol  P8ZP_SCRATCH_REG		; AY*10 doubled
		ldy  P8ZP_SCRATCH_REG
		rts
		.pend

mul_byte_25	.proc
		; A = (A*2 + A)*8 + A   (8 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A*3
		asl  a
		asl  a
		asl  a				; A*24
		clc
		adc  P8ZP_SCRATCH_REG		; + A
		rts
		.pend

mul_word_25	.proc
		; AY = (AY*2 + AY)*8 + AY   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_W1 (working copy) and P8ZP_SCRATCH_W2 (original value)
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; A / W1+1 = value*2
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W1+1	; W1 = value*3
		lda  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; A / W1+1 = value*24
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1		; low byte of value*25
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1	; high byte, with the carry of the low byte
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend

mul_byte_40	.proc
		; A = (A*4 + A)*8   (8 bit, wraps around on overflow)
		; Computed with shifts and adds so that every input (including negative
		; signed bytes) yields the product modulo 256, as multiply_bytes does.
		; A lookup table indexed by the low 3 bits of A cannot do that.
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A*5
		asl  a
		asl  a
		asl  a				; *8
		rts
		.pend

mul_word_40	.proc
		; AY = (AY*4 + AY)*8   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_W1 (working copy) and P8ZP_SCRATCH_W2 (original value)
		sty  P8ZP_SCRATCH_W2+1
		sta  P8ZP_SCRATCH_W2
		sty  P8ZP_SCRATCH_W1+1
		sta  P8ZP_SCRATCH_W1
		asl  a
		rol  P8ZP_SCRATCH_W1+1
		asl  a
		rol  P8ZP_SCRATCH_W1+1	; A / W1+1 = value*4
		clc
		adc  P8ZP_SCRATCH_W2
		sta  P8ZP_SCRATCH_W1		; low byte of value*5
		lda  P8ZP_SCRATCH_W1+1
		adc  P8ZP_SCRATCH_W2+1	; high byte of value*5 stays in A
		asl  P8ZP_SCRATCH_W1
		rol  a
		asl  P8ZP_SCRATCH_W1
		rol  a
		asl  P8ZP_SCRATCH_W1
		rol  a				; *8
		tay
		lda  P8ZP_SCRATCH_W1
		rts
		.pend

mul_byte_50	.proc
		; A = ((A*2 + A)*8 + A)*2   (8 bit, wraps around on overflow)
		; Computed with shifts and adds so that every input (including negative
		; signed bytes) yields the product modulo 256, as multiply_bytes does.
		; A lookup table indexed by the low 3 bits of A cannot do that.
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A*3
		asl  a
		asl  a
		asl  a				; A*24
		clc
		adc  P8ZP_SCRATCH_REG		; A*25
		asl  a				; *2
		rts
		.pend

mul_word_50	.proc
		; AY = (AY * 25) * 2   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_REG in addition to whatever mul_word_25 uses
		jsr  mul_word_25
		sty  P8ZP_SCRATCH_REG		; high byte has to go through memory to be rotated
		asl  a
		rol  P8ZP_SCRATCH_REG		; AY*25 doubled
		ldy  P8ZP_SCRATCH_REG
		rts
		.pend

mul_byte_80	.proc
		; A = (A*4 + A)*16   (8 bit, wraps around on overflow)
		; Computed with shifts and adds so that every input (including negative
		; signed bytes) yields the product modulo 256, as multiply_bytes does.
		; A lookup table indexed by the low 2 bits of A cannot do that.
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A*5
		asl  a
		asl  a
		asl  a
		asl  a				; *16
		rts
		.pend

mul_word_80	.proc
		; AY = (AY * 40) * 2   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_REG in addition to whatever mul_word_40 uses
		jsr  mul_word_40
		sty  P8ZP_SCRATCH_REG		; high byte has to go through memory to be rotated
		asl  a
		rol  P8ZP_SCRATCH_REG		; AY*40 doubled
		ldy  P8ZP_SCRATCH_REG
		rts
		.pend

mul_byte_100	.proc
		; A = ((A*2 + A)*8 + A)*4   (8 bit, wraps around on overflow)
		; Computed with shifts and adds so that every input (including negative
		; signed bytes) yields the product modulo 256, as multiply_bytes does.
		; A lookup table indexed by the low 2 bits of A cannot do that.
		; clobbers P8ZP_SCRATCH_REG (copy of the original value)
		sta  P8ZP_SCRATCH_REG
		asl  a
		clc
		adc  P8ZP_SCRATCH_REG		; A*3
		asl  a
		asl  a
		asl  a				; A*24
		clc
		adc  P8ZP_SCRATCH_REG		; A*25
		asl  a
		asl  a				; *4
		rts
		.pend

mul_word_100	.proc
		; AY = (AY * 25) * 4   (16 bit, wraps around on overflow)
		; in/out: A = low byte, Y = high byte
		; clobbers P8ZP_SCRATCH_REG in addition to whatever mul_word_25 uses
		jsr  mul_word_25
		sty  P8ZP_SCRATCH_REG		; high byte has to go through memory to be rotated
		asl  a
		rol  P8ZP_SCRATCH_REG		; AY*50
		asl  a
		rol  P8ZP_SCRATCH_REG		; AY*100
		ldy  P8ZP_SCRATCH_REG
		rts
		.pend

mul_word_320	.proc
		; AY = A * 256 + A * 64	 (msb in Y doesn't matter)
		; in:  A = value to multiply; whatever is in Y on entry is ignored
		; out: A = low byte, Y = high byte of A*320 (16 bit, wraps around on overflow)
		; clobbers P8ZP_SCRATCH_B1 (original value) and P8ZP_SCRATCH_REG (high byte of A*64)
		ldy  #0
		sty  P8ZP_SCRATCH_REG
		sta  P8ZP_SCRATCH_B1
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG
		asl  a
		rol  P8ZP_SCRATCH_REG		; A / REG = value*64
		pha				; low byte of value*64 is the low byte of the result
		lda  P8ZP_SCRATCH_B1
		clc
		adc  P8ZP_SCRATCH_REG		; adding value*256 only affects the high byte
		tay
		pla
		rts
		.pend

mul_word_640	.proc
		; AY = (A * 320) * 2   (msb in Y doesn't matter)
		; in:  A = value to multiply; whatever is in Y on entry is ignored
		; out: A = low byte, Y = high byte of A*640 (16 bit, wraps around on overflow)
		; The doubling is done on the 16-bit product rather than on the 8-bit input,
		; so bit 7 of the input is not lost before the multiplication.
		; clobbers P8ZP_SCRATCH_REG in addition to whatever mul_word_320 uses
		jsr  mul_word_320
		sty  P8ZP_SCRATCH_REG		; high byte has to go through memory to be rotated
		asl  a
		rol  P8ZP_SCRATCH_REG		; A*320 doubled
		ldy  P8ZP_SCRATCH_REG
		rts
		.pend


; ----------- end optimized multiplications -----------


; support for bit shifting that is too large to be unrolled:

lsr_byte_A	.proc
		; -- shift the signed byte in A right, Y times, keeping the sign (Y > 1)
		;    Non-negative values are handed over to lsr_ubyte_A.
		;    Y must not be 0: the count is decremented before it is tested,
		;    so 0 would run the loop 256 times.
		cmp  #0
		bpl  lsr_ubyte_A
_shift		sec			; negative value: feed a 1 into bit 7
		ror  a
		dey
		bne  _shift
		rts
		.pend

lsr_ubyte_A	.proc
		; -- shift the unsigned byte in A right, Y times (Y > 1)
		;    Y must not be 0: the count is decremented before it is tested,
		;    so 0 would run the loop 256 times.
_shift		lsr  a
		dey
		bne  _shift
		rts
		.pend

asl_byte_A      .proc
		; -- shift the byte in A left, Y times (Y > 1); works for signed and unsigned values
		;    Y must not be 0: the count is decremented before it is tested,
		;    so 0 would run the loop 256 times.
_shift		asl  a
		dey
		bne  _shift
		rts
		.pend


lsr_word_AY     .proc
		; -- shift the signed word in AY right, X times, keeping the sign (X > 1)
		;    in/out: A = low byte, Y = high byte
		;    Non-negative values are handed over to lsr_uword_AY.
		;    X must not be 0: the count is decremented before it is tested,
		;    so 0 would run the loop 256 times.
		;    clobbers P8ZP_SCRATCH_B1 (high byte while shifting)
		cpy  #0
		bpl  lsr_uword_AY
		sty  P8ZP_SCRATCH_B1
_shift		sec			; negative value: feed a 1 into bit 15
		ror  P8ZP_SCRATCH_B1
		ror  a
		dex
		bne  _shift
		ldy  P8ZP_SCRATCH_B1
		rts
		.pend

lsr_uword_AY    .proc
		; -- shift the unsigned word in AY right, X times (X > 1)
		;    in/out: A = low byte, Y = high byte
		;    X must not be 0: the count is decremented before it is tested,
		;    so 0 would run the loop 256 times.
		;    clobbers P8ZP_SCRATCH_B1 (high byte while shifting)
		sty  P8ZP_SCRATCH_B1
_shift		lsr  P8ZP_SCRATCH_B1
		ror  a
		dex
		bne  _shift
		ldy  P8ZP_SCRATCH_B1
		rts
		.pend

asl_word_AY     .proc
		; -- shift the word in AY left, X times (X > 1); works for signed and unsigned values
		;    in/out: A = low byte, Y = high byte
		;    X must not be 0: the count is decremented before it is tested,
		;    so 0 would run the loop 256 times.
		;    clobbers P8ZP_SCRATCH_B1 (high byte while shifting)
		sty  P8ZP_SCRATCH_B1
_shift		asl  a
		rol  P8ZP_SCRATCH_B1
		dex
		bne  _shift
		ldy  P8ZP_SCRATCH_B1
		rts
		.pend


square          .proc
; -- square of a small signed word: AY = AY * AY, for AY in -255..255
; routine by Lee Davison, source: http://6502.org/source/integers/square.htm
; This is a lot faster than doing a regular word multiplication.
;
; in:  A/Y = signed 16 bit number (A = low byte). Only -255..255 is supported
;      and the range is not checked.
; out: A/Y = unsigned 16 bit square (0..65025), also left in squarel/squareh
; clobbers X (bit counter) and tempsq
;
; Handy for drawing circles, where x^2 + y^2 = r^2 holds for every point (x, y)
; on a circle with radius r.

numberl = P8ZP_SCRATCH_W1       ; number to square low byte
numberh = P8ZP_SCRATCH_W1+1     ; number to square high byte
squarel = P8ZP_SCRATCH_W2       ; square low byte
squareh = P8ZP_SCRATCH_W2+1     ; square high byte
tempsq = P8ZP_SCRATCH_B1        ; temp byte for intermediate result

	sty  numberh
	sta  numberl

	lda  #$00
	sta  squarel        ; only the low byte is cleared: whatever is in squareh
	                    ; gets shifted out by the eight rol's below
	lda  numberl
	ldx  numberh
	bpl  _positive      ; high byte not negative: use the low byte as it is
	eor  #$ff           ; otherwise take the two's complement of the low byte
	sec
	adc  #$00

_positive:
	sta  tempsq         ; abs(number)
	ldx  #$08           ; eight bits to process, most significant first

_bitloop:
	asl  squarel        ; square = square * 2
	rol  squareh
	asl  a              ; next bit of abs(number) -> carry
	bcc  _skip_add
	tay                 ; park the remaining bits while A is used for the add
	clc
	lda  tempsq
	adc  squarel        ; square = square + abs(number)
	sta  squarel
	lda  #$00
	adc  squareh
	sta  squareh
	tya                 ; remaining bits back into A

_skip_add:
	dex
	bne  _bitloop

	lda  squarel
	ldy  squareh
	rts

		.pend