diff --git a/graphics/gr/donut/Makefile b/graphics/gr/donut/Makefile
new file mode 100644
index 00000000..c2b22e1d
--- /dev/null
+++ b/graphics/gr/donut/Makefile
@@ -0,0 +1,34 @@
+include ../../../Makefile.inc
+
+# Directory has no trailing slash: the link rule appends "/apple2_c00.inc".
+LINKER_DIR = ../../../linker_scripts
+EMPTY_DISK = ../../../empty_disk/empty.dsk
+DOS33 = ../../../utils/dos33fs-utils/dos33
+TOKENIZE = ../../../utils/asoft_basic-utils/tokenize_asoft
+
+all:	donut.dsk
+# Copy the empty image, then add HELLO and DONUT (BSAVE address 0xc00) to it.
+donut.dsk:	HELLO DONUT
+	cp $(EMPTY_DISK) $@
+	$(DOS33) -y $@ SAVE A HELLO
+	$(DOS33) -y $@ BSAVE -a 0xc00 DONUT
+
+###
+# Run the BASIC listing through the tokenizer (stdin -> stdout).
+HELLO:	hello.bas
+	$(TOKENIZE) < $< > $@
+
+####
+# Link the object using the apple2_c00.inc linker configuration.
+DONUT:	donut.o
+	ld65 -o $@ $< -C $(LINKER_DIR)/apple2_c00.inc
+# Assemble; ca65 also writes a listing file next to the object.
+donut.o:	donut.s
+	ca65 -o $@ $< -l $(@:.o=.lst)
+
+
+#### Remove build products (donut.dsk itself is left in place).
+.PHONY:	all clean
+clean:
+	rm -f *~ *.o *.lst HELLO DONUT
+
diff --git a/graphics/gr/donut/donut.s b/graphics/gr/donut/donut.s
new file mode 100644
index 00000000..79bf60c3
--- /dev/null
+++ b/graphics/gr/donut/donut.s
@@ -0,0 +1,1674 @@
+; donut
+; based on donut code by @a1k0n
+
+; zero-page
+GBASL	=	$26
+GBASH	=	$27
+
+sB_l	=	$70
+sB_h	=	$71
+cB_l	=	$72
+cB_h	=	$73
+sA_l	=	$74
+sA_h	=	$75
+cA_l	=	$76
+cA_h	=	$77
+sAsB_l	=	$78
+sAsB_h	=	$79
+cAsB_l	=	$7A
+cAsB_h	=	$7B
+sAcB_l	=	$7C
+sAcB_h	=	$7D
+cAcB_l	=	$7E
+cAcB_h	=	$7F
+
+r1i_l	=	$80
+r1i_h	=	$81
+r2i_l	=	$82
+r2i_h	=	$83
+p0x_l	=	$84
+p0x_h	=	$85
+p0y_l	=	$86
+p0y_h	=	$87
+p0z_l	=	$88
+p0z_h	=	$89
+yincC_l	=	$8A
+yincC_h	=	$8B
+yincS_l	=	$8C
+yincS_h	=	$8D
+xincX_l	=	$8E
+xincX_h	=	$8F
+xincY_l	=	$90
+xincY_h	=	$91
+xincZ_l	=	$92
+xincZ_h	=	$93
+ycA_l	=	$94
+ycA_h	=	$95
+ysA_l	=	$96
+ysA_h	=	$97
+xsAsB_l	=	$98
+xsAsB_h	=	$99
+xcAsB_l	=	$9A
+xcAsB_h	=	$9B
+vxi14_l	=	$9C
+vxi14_h	=	$9D
+vyi14_l	=	$9E
+vyi14_h	=	$9F
+vzi14_l	=	$A0
+vzi14_h	=	$A1
+t_l	=	$A2
+t_h	=	$A3
+px_l	=	$A4
+px_h	=	$A5
+py_l	=	$A6
+py_h	=	$A7
+pz_l	=	$A8
+pz_h	=	$A9
+lx0_l	=	$AA
+lx0_h	=	$AB
+ly0_l	=	$AC
+ly0_h	=	$AD
+lz0_l	=	$AE
+lz0_h	=	$AF
+t0_l	=	$B0
+t0_h	=	$B1
+t1_l	=	$B2
+t1_h	=	$B3
+t2_l	=	$B4
+t2_h	=	$B5
+d_l	=	$B6
+d_h	=	$B7
+lx_l	=	$B8
+lx_h	=	$B9
+ly_l	=	$BA
+ly_h	=	$BB
+lz_l	=	$BC
+lz_h	=	$BD
+n_l	=	$BE
+n_h	=	$BF
+dx_l	=	$C0
+dx_h	=	$C1
+dy_l	=	$C2
+dy_h	=	$C3
+dz_l	=	$C4
+dz_h	=	$C5
+a_l	=	$C6
+a_h	=	$C7
+b_l	=	$C8
+b_h	=	$C9
+c_l	=	$CA
+c_h	=	$CB
+arg1_l	=	$CC
+arg1_h	=	$CD
+arg2_l	=	$CE
+arg2_h	=	$CF
+arg3_l	=	$D0
+arg3_h	=	$D1
+arg4_l	=	$D2
+arg4_h	=	$D3
+
+CTEMP1_L	=	$D4
+CTEMP1_H	=	$D5
+CTEMP2_L	=	$D6
+CTEMP2_H	=	$D7
+CTEMP3_L	=	$D8
+CTEMP3_H	=	$D9
+CTEMP4_L	=	$DA
+CTEMP4_H	=	$DB
+
+
+
+TEMP1_L	=	$F0	; TEMP1/TEMP2: 16-bit scratch for shifts and sums
+TEMP1_H	=	$F1
+TEMP2_L	=	$F2
+TEMP2_H	=	$F3
+
+XX	=	$F4	; xloop column counter (0..79)
+YY	=	$F5	; yloop row counter (0..23), indexes gr_offsets
+II	=	$F6	; cordic_loop iteration counter (0..7)
+
+; soft-switches
+FULLGR  =       $C052	; accessed once at startup with BIT
+
+; ROM routines
+PLOT	=	$F800   ;; PLOT AT Y,A (not referenced in this file)
+SETGR	=	$FB40	; called once at startup
+
+
+donut:
+	jsr	SETGR		; monitor ROM entry point at $FB40
+	bit	FULLGR		; access the $C052 soft-switch
+
+init_vars:
+	;==================================================
+	; Seed the high-precision rotation directions:
+	; sines and cosines and their products, plus the
+	; two radii.  Bytes that share a value are written
+	; from a single load of A.
+	;
+	;	int16_t sB = 0, cB = 16384;
+	;	int16_t sA = 11583, cA = 11583;
+	;	int16_t sAsB = 0, cAsB = 0;
+	;	int16_t sAcB = 11583, cAcB = 11583;
+	;	const int16_t r1i = 256;
+	;	const int16_t r2i = 2*256;
+	;==================================================
+
+	lda	#0		; sB = sAsB = cAsB = 0
+	sta	sB_l
+	sta	sB_h
+	sta	sAsB_l
+	sta	sAsB_h
+	sta	cAsB_l
+	sta	cAsB_h
+
+	lda	#<16384		; cB = 16384
+	sta	cB_l
+	lda	#>16384
+	sta	cB_h
+
+	lda	#<11583		; sA = cA = sAcB = cAcB = 11583
+	sta	sA_l
+	sta	cA_l
+	sta	sAcB_l
+	sta	cAcB_l
+	lda	#>11583
+	sta	sA_h
+	sta	cA_h
+	sta	sAcB_h
+	sta	cAcB_h
+
+	; radii
+	; FIXME: propagate
+	lda	#<256		; r1i = 256
+	sta	r1i_l
+	lda	#>256
+	sta	r1i_h
+
+	lda	#<512		; r2i = 2*256
+	sta	r2i_l
+	lda	#>512
+	sta	r2i_h
+
+main_loop:
+	; int16_t p0x = (sB + (sB<<2)) >> 6;
+	; C does this in int: keep 5*sB in 24 bits (TEMP1_L = top byte), signed >>
+	ldx	#0		; X = sign extension of sB ($00 or $FF)
+	lda	sB_h
+	bpl	p0x_ext
+	dex
+p0x_ext:
+	stx	TEMP1_L
+	sta	p0x_h
+	lda	sB_l
+	asl			; TEMP1_L:p0x_h:A = sB<<2
+	rol	p0x_h
+	rol	TEMP1_L
+	asl
+	rol	p0x_h
+	rol	TEMP1_L
+	clc			; TEMP1_L:p0x = sB + (sB<<2)
+	adc	sB_l
+	sta	p0x_l
+	lda	p0x_h
+	adc	sB_h
+	sta	p0x_h
+	txa
+	adc	TEMP1_L		; top byte stays in A for the shift
+	ldx	#6		; p0x = (sB + (sB<<2)) >> 6
+p1:
+	cmp	#$80		; carry = sign bit -> arithmetic shift right
+	ror
+	ror	p0x_h
+	ror	p0x_l
+	dex
+	bne	p1
+
+	; int16_t p0y = (sAcB + (sAcB<<2)) >> 6;
+	; math is done in 32-bit in C before casting down to 16, so keep 24 bits
+	; here (TEMP1_L = top byte): 2d3f*5 = e23b, / 0x40 = 0x388
+	ldx	#0		; X = sign extension of sAcB ($00 or $FF)
+	lda	sAcB_h
+	bpl	p0y_ext
+	dex
+p0y_ext:
+	stx	TEMP1_L
+	sta	p0y_h
+	lda	sAcB_l
+	asl			; TEMP1_L:p0y_h:A = sAcB<<2
+	rol	p0y_h
+	rol	TEMP1_L
+	asl
+	rol	p0y_h
+	rol	TEMP1_L
+	clc			; TEMP1_L:p0y = sAcB + (sAcB<<2)
+	adc	sAcB_l
+	sta	p0y_l
+	lda	p0y_h
+	adc	sAcB_h
+	sta	p0y_h
+	txa
+	adc	TEMP1_L		; top byte stays in A for the shift
+	ldx	#6		; p0y = (sAcB + (sAcB<<2)) >> 6
+p2:
+	cmp	#$80		; carry = sign bit -> arithmetic shift right
+	ror
+	ror	p0y_h
+	ror	p0y_l
+	dex
+	bne	p2
+
+	; int16_t p0z = (- (cAcB +(cAcB<<2))) >> 6;
+	; 24-bit math (TEMP1_L = top byte); negate BEFORE the signed shift, as in C
+	ldx	#0		; X = sign extension of cAcB ($00 or $FF)
+	lda	cAcB_h
+	bpl	p0z_ext
+	dex
+p0z_ext:
+	stx	TEMP1_L
+	sta	p0z_h
+	lda	cAcB_l
+	asl			; TEMP1_L:p0z_h:A = cAcB<<2
+	rol	p0z_h
+	rol	TEMP1_L
+	asl
+	rol	p0z_h
+	rol	TEMP1_L
+	clc			; TEMP1_L:p0z = cAcB + (cAcB<<2)
+	adc	cAcB_l
+	sta	p0z_l
+	lda	p0z_h
+	adc	cAcB_h
+	sta	p0z_h
+	txa
+	adc	TEMP1_L
+	sta	TEMP1_L
+	sec			; TEMP1_L:p0z = -(cAcB + (cAcB<<2))
+	lda	#0
+	sbc	p0z_l
+	sta	p0z_l
+	lda	#0
+	sbc	p0z_h
+	sta	p0z_h
+	lda	#0
+	sbc	TEMP1_L		; top byte stays in A for the shift
+	ldx	#6		; p0z = (-(cAcB + (cAcB<<2))) >> 6
+p3:
+	cmp	#$80		; carry = sign bit -> arithmetic shift right
+	ror
+	ror	p0z_h
+	ror	p0z_l
+	dex
+	bne	p3
+
+	;========================================
+	; int16_t yincC = (cA >> 6) + (cA >> 5);
+	;  tested: OK
+	; (cmp #$80 / ror = arithmetic shift right: carry is loaded with the sign)
+	lda	cA_l		; TEMP1 = cA
+	sta	TEMP1_L
+	lda	cA_h
+	sta	TEMP1_H
+
+	ldx	#5		; TEMP1 = (cA>>5)
+p4:
+	lda	TEMP1_H
+	cmp	#$80		; carry = sign bit of TEMP1
+	ror			; high byte >> 1, sign preserved
+	sta	TEMP1_H
+	ror	TEMP1_L		; low byte takes the bit shifted out
+	dex
+	bne	p4
+
+	lda	TEMP1_L		; TEMP2 = (cA>>6)
+	sta	TEMP2_L
+	lda	TEMP1_H
+	cmp	#$80
+	ror
+	sta	TEMP2_H
+	ror	TEMP2_L
+
+	clc			; yincC = TEMP1+TEMP2
+	lda	TEMP2_L
+	adc	TEMP1_L
+	sta	yincC_l
+	lda	TEMP2_H
+	adc	TEMP1_H
+	sta	yincC_h
+
+	;========================================
+	; int16_t yincS = (sA >> 6) + (sA >> 5);
+	;   tested: OK
+
+	lda	sA_l		; TEMP1 = sA
+	sta	TEMP1_L
+	lda	sA_h
+	sta	TEMP1_H
+
+	ldx	#5		; TEMP1 = (sA>>5)
+p5:
+	lda	TEMP1_H
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+	dex
+	bne	p5
+
+
+	lda	TEMP1_L		; TEMP2 = (sA>>6)
+	sta	TEMP2_L
+	lda	TEMP1_H
+	cmp	#$80
+	ror
+	sta	TEMP2_H
+	ror	TEMP2_L
+
+	clc			; yincS = TEMP1+TEMP2
+	lda	TEMP2_L
+	adc	TEMP1_L
+	sta	yincS_l
+	lda	TEMP2_H
+	adc	TEMP1_H
+	sta	yincS_h
+
+	;=========================================
+	; int16_t xincX = (cB >> 7) + (cB >> 6);
+	;	CB = $72	xincX= 8E
+	; Tested: OK
+
+	lda	cB_l		; TEMP1 = CB
+	sta	TEMP1_L
+	lda	cB_h
+	sta	TEMP1_H
+
+	ldx	#6		; TEMP1 = CB>>6
+p6:
+	lda	TEMP1_H
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+	dex
+	bne	p6
+
+	lda	TEMP1_L		; TEMP2 = CB>>7
+	sta	TEMP2_L
+	lda	TEMP1_H
+	cmp	#$80
+	ror
+	sta	TEMP2_H
+	ror	TEMP2_L
+
+	clc			; xincx = TEMP1+TEMP2
+	lda	TEMP2_L
+	adc	TEMP1_L
+	sta	xincX_l
+	lda	TEMP2_H
+	adc	TEMP1_H
+	sta	xincX_h
+
+	;===========================================
+	; int16_t xincY = (sAsB >> 7) + (sAsB >> 6);
+
+	lda	sAsB_l		; TEMP1 = sAsB
+	sta	TEMP1_L
+	lda	sAsB_h
+	sta	TEMP1_H
+
+	ldx	#6		; TEMP1 = sAsB>>6
+p7:
+	lda	TEMP1_H
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+	dex
+	bne	p7
+
+
+	lda	TEMP1_L		; TEMP2 = sAsB>>7
+	sta	TEMP2_L
+	lda	TEMP1_H
+	cmp	#$80
+	ror
+	sta	TEMP2_H
+	ror	TEMP2_L
+
+	clc			; xincY = TEMP1+TEMP2
+	lda	TEMP2_L
+	adc	TEMP1_L
+	sta	xincY_l
+	lda	TEMP2_H
+	adc	TEMP1_H
+	sta	xincY_h
+
+	;===========================================
+	; int16_t xincZ = (cAsB >> 7) + (cAsB >> 6);
+
+	lda	cAsB_l		; TEMP1 = cAsB
+	sta	TEMP1_L
+	lda	cAsB_h
+	sta	TEMP1_H
+
+	ldx	#6		; TEMP1 = cAsB>>6
+p8:
+	lda	TEMP1_H
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+	dex
+	bne	p8
+
+
+	lda	TEMP1_L		; TEMP2 = cAsB>>7
+	sta	TEMP2_L
+	lda	TEMP1_H
+	cmp	#$80
+	ror
+	sta	TEMP2_H
+	ror	TEMP2_L
+
+	clc			; xincZ = TEMP1+TEMP2
+	lda	TEMP2_L
+	adc	TEMP1_L
+	sta	xincZ_l
+	lda	TEMP2_H
+	adc	TEMP1_H
+	sta	xincZ_h
+
+	;========================================
+	; int16_t ycA = -((cA >> 1) + (cA >> 4));
+	;	ycA = $94
+	; tested: OK
+
+	lda	cA_l		; TEMP1 = CA
+	sta	TEMP1_L
+	lda	cA_h
+	sta	TEMP1_H
+
+	cmp	#$80		; TEMP1 = CA>>1
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+
+	lda	TEMP1_L		; TEMP2 = CA>>1
+	sta	TEMP2_L
+	lda	TEMP1_H
+	sta	TEMP2_H
+
+	ldx	#3		; TEMP2 = CA>>4
+p9:
+	lda	TEMP2_H
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	TEMP2_H
+	ror	TEMP2_L
+	dex
+	bne	p9
+
+	; add
+	clc			; yCA = TEMP1+TEMP2
+	lda	TEMP2_L
+	adc	TEMP1_L
+	sta	ycA_l
+	lda	TEMP2_H
+	adc	TEMP1_H
+	sta	ycA_h
+
+	; negate
+	sec			; yCA = -(TEMP1+TEMP2)
+	lda	#0
+	sbc	ycA_l		; 0 - low byte, borrow carries into the high byte
+	sta	ycA_l
+	lda	#0
+	sbc	ycA_h
+	sta	ycA_h
+
+
+	;========================================
+	; int16_t ysA = -((sA >> 1) + (sA >> 4));
+
+	lda	sA_l		; TEMP1 = sA
+	sta	TEMP1_L
+	lda	sA_h
+	sta	TEMP1_H
+
+	cmp	#$80		; TEMP1 = sA>>1
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+
+	lda	TEMP1_L		; TEMP2 = sA>>1
+	sta	TEMP2_L
+	lda	TEMP1_H
+	sta	TEMP2_H
+
+	ldx	#3		; TEMP2 = sA>>4
+p10:
+	lda	TEMP2_H
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	TEMP2_H
+	ror	TEMP2_L
+	dex
+	bne	p10
+
+	; add
+	clc			; ysA = TEMP1+TEMP2
+	lda	TEMP2_L
+	adc	TEMP1_L
+	sta	ysA_l
+	lda	TEMP2_H
+	adc	TEMP1_H
+	sta	ysA_h
+
+	; negate
+	sec			; ysA = -(TEMP1+TEMP2)
+	lda	#0
+	sbc	ysA_l
+	sta	ysA_l
+	lda	#0
+	sbc	ysA_h
+	sta	ysA_h
+
+	;=================================
+	; for (int j = 0; j < 23; j++) {   (this port runs YY = 0..23, 24 rows)
+	ldx	#0
+	stx	YY
+yloop:
+	ldx	YY		; GBASL/H = start address of text row YY
+	lda	gr_offsets_l,X
+	sta	GBASL
+	lda	gr_offsets_h,X
+	sta	GBASH			; TODO: add in page
+
+	;======================================
+	; int16_t xsAsB = (sAsB >> 4) - sAsB;  ; -40*xincY
+	;  tested: OK
+
+	lda	sAsB_l		; xsAsB = sAsB
+	sta	xsAsB_l
+	lda	sAsB_h
+	sta	xsAsB_h
+
+	ldx	#4
+p99:				; xsAsB = sAsB>>4
+	lda	xsAsB_h
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	xsAsB_h
+	ror	xsAsB_l
+	dex
+	bne	p99
+
+	sec			; xsAsB = (sAsB>>4)-sAsB
+	lda	xsAsB_l
+	sbc	sAsB_l
+	sta	xsAsB_l
+	lda	xsAsB_h
+	sbc	sAsB_h
+	sta	xsAsB_h
+
+
+	; int16_t xcAsB = (cAsB >> 4) - cAsB;  ; -40*xincZ;
+
+	lda	cAsB_l		; xcAsB = cAsB
+	sta	xcAsB_l
+	lda	cAsB_h
+	sta	xcAsB_h
+
+	ldx	#4		; xcAsB = cAsB>>4
+p98:
+	lda	xcAsB_h
+	cmp	#$80
+	ror
+	sta	xcAsB_h
+	ror	xcAsB_l
+	dex
+	bne	p98
+
+	sec			; xcAsB = (cAsB>>4)-cAsB
+	lda	xcAsB_l
+	sbc	cAsB_l
+	sta	xcAsB_l
+	lda	xcAsB_h
+	sbc	cAsB_h
+	sta	xcAsB_h
+
+	;==============================
+	; int16_t vxi14 = (cB >> 4) - cB - sB; ; -40*xincX - sB;
+	;	vxi14 = $9C
+	; tested OK
+
+	lda	cB_l		; vxi14 = cB
+	sta	vxi14_l
+	lda	cB_h
+	sta	vxi14_h
+
+	ldx	#4		; vxi14 = (cB>>4)
+p97:
+	lda	vxi14_h
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	vxi14_h
+	ror	vxi14_l
+	dex
+	bne	p97
+
+	sec			; vxi14 = (cb>>4) - CB
+	lda	vxi14_l
+	sbc	cB_l
+	sta	vxi14_l
+	lda	vxi14_h
+	sbc	cB_h
+	sta	vxi14_h
+
+	sec			; vxi14 = (cb>>4) - CB - SB
+	lda	vxi14_l
+	sbc	sB_l
+	sta	vxi14_l
+	lda	vxi14_h
+	sbc	sB_h
+	sta	vxi14_h
+
+	; int16_t vyi14 = ycA - xsAsB - sAcB;
+	sec
+	lda	ycA_l		; vyi14 = yCA - xsAsB
+	sbc	xsAsB_l
+	sta	vyi14_l
+	lda	ycA_h
+	sbc	xsAsB_h
+	sta	vyi14_h
+
+	sec			; vyi14 = yCA - xsAsB - sAcB
+	lda	vyi14_l
+	sbc	sAcB_l
+	sta	vyi14_l
+	lda	vyi14_h
+	sbc	sAcB_h
+	sta	vyi14_h
+
+	; int16_t vzi14 = ysA + xcAsB + cAcB;
+
+	clc			; vzi14 = ysA + xcAsB
+	lda	ysA_l
+	adc	xcAsB_l
+	sta	vzi14_l
+	lda	ysA_h
+	adc	xcAsB_h
+	sta	vzi14_h
+
+	clc			; vzi14 = ysA + xcAsB + cAcB
+	lda	vzi14_l
+	adc	cAcB_l
+	sta	vzi14_l
+	lda	vzi14_h
+	adc	cAcB_h
+	sta	vzi14_h
+
+
+	; for (int i = 0; i < 79; i++) {   (this port runs XX = 0..79)
+	ldy	#0
+	sty	XX
+xloop:
+
+	; int16_t t = 512;
+	lda	#$2
+	sta	t_h
+	lda	#0
+	sta	t_l
+
+	; int16_t px = p0x + (vxi14 >> 5);
+
+	lda	vxi14_l		; px = (vxi14)
+	sta	px_l
+	lda	vxi14_h
+	sta	px_h
+
+	ldx	#5		; px = (vxi14>>5)
+p96:
+	lda	px_h
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	px_h
+	ror	px_l
+	dex
+	bne	p96
+
+	clc			; px = p0x + (vxi14>>5)
+	lda	px_l
+	adc	p0x_l
+	sta	px_l
+	lda	px_h
+	adc	p0x_h
+	sta	px_h
+
+	; int16_t py = p0y + (vyi14 >> 5);
+
+	lda	vyi14_l		; py = vyi14
+	sta	py_l
+	lda	vyi14_h
+	sta	py_h
+
+	ldx	#5		; py = (vyi14>>5)
+p95:
+	lda	py_h
+	cmp	#$80
+	ror
+	sta	py_h
+	ror	py_l
+	dex
+	bne	p95
+
+	clc			; py = p0y + (vyi14>>5)
+	lda	py_l
+	adc	p0y_l
+	sta	py_l
+	lda	py_h
+	adc	p0y_h
+	sta	py_h
+
+	; int16_t pz = p0z + (vzi14 >> 5);
+
+	lda	vzi14_l		; pz = vzi14
+	sta	pz_l
+	lda	vzi14_h
+	sta	pz_h
+
+	ldx	#5		; pz = (vzi14>>5)
+p94:
+	lda	pz_h
+	cmp	#$80
+	ror
+	sta	pz_h
+	ror	pz_l
+	dex
+	bne	p94
+
+	clc			; pz = p0z + (vzi14>>5)
+	lda	pz_l
+	adc	p0z_l
+	sta	pz_l
+	lda	pz_h
+	adc	p0z_h
+	sta	pz_h
+
+	; int16_t lx0 = sB >> 2;
+	; (lx0/ly0/lz0 read only sB, sAcB, cA, cAcB and sA)
+	lda	sB_l		; lx0 = sB
+	sta	lx0_l
+	lda	sB_h
+	sta	lx0_h
+
+	ldx	#2		; lx0 = sB>>2
+p93:
+	lda	lx0_h
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	lx0_h
+	ror	lx0_l
+	dex
+	bne	p93
+
+	; int16_t ly0 = (sAcB - cA) >> 2;
+
+	sec			; ly0 = sAcB - cA
+	lda	sAcB_l
+	sbc	cA_l
+	sta	ly0_l
+	lda	sAcB_h
+	sbc	cA_h
+	sta	ly0_h
+
+	ldx	#2		; ly0 = (sAcB - cA)>>2
+p92:
+	lda	ly0_h
+	cmp	#$80
+	ror
+	sta	ly0_h
+	ror	ly0_l
+	dex
+	bne	p92
+
+
+	; int16_t lz0 = (-cAcB - sA) >> 2;
+
+	; negate
+	sec			; lz0 = -cAcB
+	lda	#0
+	sbc	cAcB_l
+	sta	lz0_l
+	lda	#0
+	sbc	cAcB_h
+	sta	lz0_h
+
+	sec			; lz0 = (-cAcB - sA)
+	lda	lz0_l
+	sbc	sA_l
+	sta	lz0_l
+	lda	lz0_h
+	sbc	sA_h
+	sta	lz0_h
+
+	ldx	#2		; lz0 = (-cAcB - sA)>>2
+p91:
+	lda	lz0_h
+	cmp	#$80
+	ror
+	sta	lz0_h
+	ror	lz0_l
+	dex
+	bne	p91
+
+
+color_loop:
+	;=========================================
+	; int16_t lx = lx0, ly = ly0, lz = lz0;
+	;	AA, AC ,AE
+	; Tested OK!
+	; (re-entered from the end of done_mul_loop until a pixel is stored)
+	lda	lx0_l
+	sta	lx_l
+	lda	lx0_h
+	sta	lx_h
+
+	lda	ly0_l
+	sta	ly_l
+	lda	ly0_h
+	sta	ly_h
+
+	lda	lz0_l
+	sta	lz_l
+	lda	lz0_h
+	sta	lz_h
+
+	;=====================================
+	; t0 = length_cordic(px, py, &lx, ly);
+	;	px=A4, py=A6
+	; TEST OK!
+
+	lda	px_l		; arg1 = px
+	sta	arg1_l
+	lda	px_h
+	sta	arg1_h
+
+	lda	py_l		; arg2 = py
+	sta	arg2_l
+	lda	py_h
+	sta	arg2_h
+
+	lda	lx_l		; arg3 = lx
+	sta	arg3_l
+	lda	lx_h
+	sta	arg3_h
+
+	lda	ly_l		; arg4 = ly
+	sta	arg4_l
+	lda	ly_h
+	sta	arg4_h
+
+	jsr	length_cordic
+
+	;===========================
+	; T0 = $B0
+	;	check OK!
+
+	lda	arg1_l		; t0 = returned length (arg1)
+	sta	t0_l
+	lda	arg1_h
+	sta	t0_h
+
+	lda	arg3_l		; lx = updated arg3
+	sta	lx_l
+	lda	arg3_h
+	sta	lx_h
+
+
+	;===============
+	; t1 = t0 - r2i;
+	;	t1=$B2
+	; TESTED OK
+
+	sec
+	lda	t0_l
+	sbc	r2i_l
+	sta	t1_l
+	lda	t0_h
+	sbc	r2i_h
+	sta	t1_h
+
+
+	;=====================================
+	; t2 = length_cordic(pz, t1, &lz, lx);
+	;	PZ=$A8 LZ=$BC
+	; TESTED: PZ is off by 1
+
+	lda	pz_l		; arg1 = pz
+	sta	arg1_l
+	lda	pz_h
+	sta	arg1_h
+
+	lda	t1_l		; arg2 = t1
+	sta	arg2_l
+	lda	t1_h
+	sta	arg2_h
+
+	lda	lz_l		; arg3 = lz
+	sta	arg3_l
+	lda	lz_h
+	sta	arg3_h
+
+	lda	lx_l		; arg4 = lx
+	sta	arg4_l
+	lda	lx_h
+	sta	arg4_h
+
+	jsr	length_cordic
+
+	;========================
+	; after
+	;	check arg1=$CC arg3=$D0
+	;	T2=B4 LZ=BC
+	; TEST OK
+
+	lda	arg1_l		; t2 = returned length (arg1)
+	sta	t2_l
+	lda	arg1_h
+	sta	t2_h
+
+	lda	arg3_l		; lz = updated arg3
+	sta	lz_l
+	lda	arg3_h
+	sta	lz_h
+
+	;==============
+	; d = t2 - r1i;
+	; TEST: ok
+	sec
+	lda	t2_l
+	sbc	r1i_l
+	sta	d_l
+	lda	t2_h
+	sbc	r1i_h
+	sta	d_h
+
+
+	;========
+	; t += d;
+
+	clc
+	lda	t_l
+	adc	d_l
+	sta	t_l
+	lda	t_h
+	adc	d_h
+	sta	t_h
+
+	; check if too far
+
+	; T in $A2, D in $B6
+	;	CORRECT
+	; NOTE: only t_h is tested, unsigned: black whenever t_h >= 8
+check_t:
+	lda	t_h			; if (t > 8*256)
+	cmp	#8
+	bcc	check_d
+
+	lda	#0			; black
+	beq	do_plot_color		; bra
+
+check_d:
+	lda	d_l	; if (d<2)
+	cmp	#2	; 16-bit signed compare
+	lda	d_h
+	sbc	#0
+	bvc	label	; on overflow the sign of the result is inverted...
+	eor	#$80	; ...so flip it back
+label:
+	; if N=1 then less
+
+	bpl	skip_plot		; bge
+
+	;int N = lz >> 9;
+	lda	lz_h		; A = lz_h >> 1 (arithmetic) = lz >> 9
+	cmp	#$80
+	ror
+
+	tax			; X = N, flags follow N
+	bmi	n_negative
+	cpx	#12
+	bcc	do_plot		; blt
+	; if (N>11) N=11;
+	ldx	#11
+	bne	do_plot		; bra
+
+n_negative:
+	; if (N<0) N=0;
+	ldx	#0
+
+;	beq	do_plot		; bra
+
+	; fall through
+do_plot:
+	lda	colors,X	; A = color byte for shade N
+do_plot_color:
+	; T=A2
+	pha			; keep the color while Y is computed
+	lda	XX
+	lsr			; Y = XX/2
+	tay
+	pla
+	sta	(GBASL),Y	; store into the current row
+
+	jmp	done_color
+
+skip_plot:
+	; reached when d >= 2: advance px,py,pz and run color_loop again
+	; 11x1.14 fixed point 3x parallel multiply
+	; only 16 bit registers needed; starts from highest bit to lowest
+	; d is about 2..1100, so 11 bits are sufficient
+
+	; int16_t dx = 0, dy = 0, dz = 0;
+	lda	#0
+	sta	dx_l
+	sta	dx_h
+	sta	dy_l
+	sta	dy_h
+	sta	dz_l
+	sta	dz_h
+
+	; int16_t a = vxi14, b = vyi14, c = vzi14;
+	lda	vxi14_l
+	sta	a_l
+	lda	vxi14_h
+	sta	a_h
+
+	lda	vyi14_l
+	sta	b_l
+	lda	vyi14_h
+	sta	b_h
+
+	lda	vzi14_l
+	sta	c_l
+	lda	vzi14_h
+	sta	c_h
+
+	; check a,b,c = c6, c8, cA
+	; TEST: OK
+
+mul_loop:
+	lda	d_h			; while(d)
+	bne	cont_mul_loop
+	lda	d_l
+	beq	done_mul_loop
+cont_mul_loop:
+	lda	d_h
+	and	#(4)		; if (d&1024) {
+	beq	not_1023	; bit 10 clear: skip the adds
+
+	; dx += a;
+	clc
+	lda	dx_l
+	adc	a_l
+	sta	dx_l
+	lda	dx_h
+	adc	a_h
+	sta	dx_h
+
+	; dy += b;
+	clc
+	lda	dy_l
+	adc	b_l
+	sta	dy_l
+	lda	dy_h
+	adc	b_h
+	sta	dy_h
+
+	; dz += c;
+	clc
+	lda	dz_l
+	adc	c_l
+	sta	dz_l
+	lda	dz_h
+	adc	c_h
+	sta	dz_h
+
+not_1023:
+	; d = (d&1023) << 1;
+	lda	d_h
+	and	#$3		; keep bits 8-9 of d (d & 1023)
+	sta	d_h
+
+	asl	d_l	; d<<=1
+	rol	d_h
+
+	; a >>= 1;
+
+	lda	a_h
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	a_h
+	ror	a_l
+
+	; b >>= 1;
+
+	lda	b_h
+	cmp	#$80
+	ror
+	sta	b_h
+	ror	b_l
+
+	; c >>= 1;
+
+	lda	c_h
+	cmp	#$80
+	ror
+	sta	c_h
+	ror	c_l
+
+	jmp	mul_loop
+done_mul_loop:
+	; add the scaled step (dx,dy,dz) to the point, then re-test it
+	; we already shifted down 10 bits, so get the last four
+	; DX=C0 / C2 / C4
+	; px=A4 / a6 / a8
+	; TEST: GOOD
+
+	; px += dx >> 4;
+
+	lda	dx_l		; TEMP1 = dx
+	sta	TEMP1_L
+	lda	dx_h
+	sta	TEMP1_H
+
+	ldx	#4		; TEMP1 = dx>>4
+p90:
+	lda	TEMP1_H
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+	dex
+	bne	p90
+
+	clc			; px += TEMP1
+	lda	px_l
+	adc	TEMP1_L
+	sta	px_l
+	lda	px_h
+	adc	TEMP1_H
+	sta	px_h
+
+	;===================
+	; py += dy >> 4;
+
+	lda	dy_l		; TEMP1 = dy
+	sta	TEMP1_L
+	lda	dy_h
+	sta	TEMP1_H
+
+	ldx	#4		; TEMP1 = dy>>4
+p89:
+	lda	TEMP1_H
+	cmp	#$80
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+	dex
+	bne	p89
+
+	clc			; py += TEMP1
+	lda	py_l
+	adc	TEMP1_L
+	sta	py_l
+	lda	py_h
+	adc	TEMP1_H
+	sta	py_h
+
+	; pz += dz >> 4;
+
+	lda	dz_l		; TEMP1 = dz
+	sta	TEMP1_L
+	lda	dz_h
+	sta	TEMP1_H
+
+	ldx	#4		; TEMP1 = dz>>4
+p88:
+	lda	TEMP1_H
+	cmp	#$80
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+	dex
+	bne	p88
+
+	clc			; pz += TEMP1
+	lda	pz_l
+	adc	TEMP1_L
+	sta	pz_l
+	lda	pz_h
+	adc	TEMP1_H
+	sta	pz_h
+
+	; AFTER
+	; px=ok, py=unchanged?
+
+	jmp	color_loop
+done_color:
+	; pixel stored: step the ray direction to the next column
+	; vxi14 += xincX;
+	clc
+	lda	vxi14_l
+	adc	xincX_l
+	sta	vxi14_l
+	lda	vxi14_h
+	adc	xincX_h
+	sta	vxi14_h
+
+	; vyi14 -= xincY;
+	sec
+	lda	vyi14_l
+	sbc	xincY_l
+	sta	vyi14_l
+	lda	vyi14_h
+	sbc	xincY_h
+	sta	vyi14_h
+
+	; vzi14 += xincZ;
+	clc
+	lda	vzi14_l
+	adc	xincZ_l
+	sta	vzi14_l
+	lda	vzi14_h
+	adc	xincZ_h
+	sta	vzi14_h
+
+
+	inc	XX		; next column; leave after XX = 79
+	ldy	XX
+	cpy	#80
+	beq	done_xloop
+	jmp	xloop
+
+done_xloop:
+	; row finished: step ycA/ysA, then move to the next row
+	;==============
+	; ycA += yincC;
+
+	clc
+	lda	ycA_l
+	adc	yincC_l
+	sta	ycA_l
+	lda	ycA_h
+	adc	yincC_h
+	sta	ycA_h
+
+	;===============
+	; ysA += yincS;
+
+	clc
+	lda	ysA_l
+	adc	yincS_l
+	sta	ysA_l
+	lda	ysA_h
+	adc	yincS_h
+	sta	ysA_h
+
+	inc	YY		; next row; leave after YY = 23
+	lda	YY
+	cmp	#24
+	beq	done_yloop
+
+	jmp	yloop
+
+done_yloop:
+	; the rotation step below is excluded by .if 0 (it is still C text)
+rotate:
+.if 0
+	; rotate sines, cosines, and products thereof
+	; this animates the torus rotation about two axes
+
+	cA-=(sA>>5);
+	sA+=(cA>>5);
+
+	cAsB-=(sAsB>>5);
+	sAsB+=(cAsB>>5);
+
+	cAcB-=(sAcB>>5);
+	sAcB+=(cAcB>>5);
+
+	cB-=(sB>>6);
+	sB+=(cB>>6);
+
+	cAcB-=(cAsB>>6);
+	cAsB+=(cAcB>>6);
+
+	sAcB-=(sAsB>>6);
+	sAsB+=(sAcB>>6);
+
+	; flip pages?
+
+	; usleep(15000);
+
+.endif
+
+	jmp	main_loop
+
+
+; CORDIC algorithm to find magnitude of |x,y| by rotating the x,y vector onto
+; the x axis. This also brings vector (x2,y2) along for the ride, and writes
+; back to x2 -- this is used to rotate the lighting vector from the normal of
+; the torus surface towards the camera, and thus determine the lighting amount.
+; We only need to keep one of the two lighting normal coordinates.
+; In:  ARG1..ARG4 as below.  Out: ARG1 = length, ARG3 = rotated x2.
+; ARG1=x,ARG2=y,ARG3=x2,ARG4=y2
+; Uses: A, X, II, TEMP1, TEMP2, CTEMP1-4; ARG2 and ARG4 are overwritten.
+; int length_cordic(int16_t x, int16_t y, int16_t *x2_, int16_t y2) {
+
+length_cordic:
+
+	lda	arg1_h	;  if (arg1 < 0) { ; start in right half-plane
+	bpl	no_adjust
+
+	; arg1 = -arg1;
+	; negate
+	sec
+	lda	#0
+	sbc	arg1_l
+	sta	arg1_l
+	lda	#0
+	sbc	arg1_h
+	sta	arg1_h
+
+	; arg3 = -arg3;
+	; negate
+	sec
+	lda	#0
+	sbc	arg3_l
+	sta	arg3_l
+	lda	#0
+	sbc	arg3_h
+	sta	arg3_h
+
+no_adjust:
+	; check here
+	;  arg1=CC, CE, D0, D2
+	;	TEST OK
+
+	; for (int i = 0; i < 8; i++) {
+	lda	#0
+	sta	II
+cordic_loop:
+	; int16_t temp1 = arg1;
+	lda	arg1_l
+	sta	TEMP1_L
+	lda	arg1_h
+	sta	TEMP1_H
+
+	; int16_t temp2 = arg3;
+	lda	arg3_l
+	sta	TEMP2_L
+	lda	arg3_h
+	sta	TEMP2_H
+
+	; CTEMP1 = arg2>>i
+
+	lda	arg2_l
+	sta	CTEMP1_L
+	lda	arg2_h
+	sta	CTEMP1_H
+	ldx	II
+	beq	done_pc1	; i = 0: nothing to shift
+pc1:
+	lda	CTEMP1_H
+	cmp	#$80		; carry = sign bit -> arithmetic shift
+	ror
+	sta	CTEMP1_H
+	ror	CTEMP1_L
+	dex
+	bne	pc1
+done_pc1:
+
+	; CTEMP2 = temp1>>i
+
+	lda	TEMP1_L
+	sta	CTEMP2_L
+	lda	TEMP1_H
+	sta	CTEMP2_H
+	ldx	II
+	beq	done_pc2
+pc2:
+	lda	CTEMP2_H
+	cmp	#$80
+	ror
+	sta	CTEMP2_H
+	ror	CTEMP2_L
+	dex
+	bne	pc2
+done_pc2:
+
+	; CTEMP3 = arg4>>i
+
+	lda	arg4_l
+	sta	CTEMP3_L
+	lda	arg4_h
+	sta	CTEMP3_H
+	ldx	II
+	beq	done_pc3
+pc3:
+	lda	CTEMP3_H
+	cmp	#$80
+	ror
+	sta	CTEMP3_H
+	ror	CTEMP3_L
+	dex
+	bne	pc3
+done_pc3:
+
+	; CTEMP4 = temp2>>i
+	lda	TEMP2_L
+	sta	CTEMP4_L
+	lda	TEMP2_H
+	sta	CTEMP4_H
+	ldx	II
+	beq	done_pc4
+pc4:
+	lda	CTEMP4_H
+	cmp	#$80
+	ror
+	sta	CTEMP4_H
+	ror	CTEMP4_L
+	dex
+	bne	pc4
+done_pc4:
+
+	;=========================
+	; all four shifted terms were taken before arg1..arg4 change below
+	lda	arg2_h		;    if (arg2 < 0) {
+	bpl	cordic_pos
+cordic_neg:
+
+	; arg1 -= CTEMP1
+	sec
+	lda	arg1_l
+	sbc	CTEMP1_L
+	sta	arg1_l
+	lda	arg1_h
+	sbc	CTEMP1_H
+	sta	arg1_h
+
+	; arg2 += CTEMP2
+	clc
+	lda	arg2_l
+	adc	CTEMP2_L
+	sta	arg2_l
+	lda	arg2_h
+	adc	CTEMP2_H
+	sta	arg2_h
+
+	; arg3 -= CTEMP3
+	sec
+	lda	arg3_l
+	sbc	CTEMP3_L
+	sta	arg3_l
+	lda	arg3_h
+	sbc	CTEMP3_H
+	sta	arg3_h
+
+	; arg4 += CTEMP4
+	clc
+	lda	arg4_l
+	adc	CTEMP4_L
+	sta	arg4_l
+	lda	arg4_h
+	adc	CTEMP4_H
+	sta	arg4_h
+
+	jmp	cordic_if_done
+cordic_pos:
+
+	; arg1 += CTEMP1
+	clc
+	lda	arg1_l
+	adc	CTEMP1_L
+	sta	arg1_l
+	lda	arg1_h
+	adc	CTEMP1_H
+	sta	arg1_h
+
+	; arg2 -= CTEMP2
+	sec
+	lda	arg2_l
+	sbc	CTEMP2_L
+	sta	arg2_l
+	lda	arg2_h
+	sbc	CTEMP2_H
+	sta	arg2_h
+
+	; arg3 += CTEMP3
+	clc
+	lda	arg3_l
+	adc	CTEMP3_L
+	sta	arg3_l
+	lda	arg3_h
+	adc	CTEMP3_H
+	sta	arg3_h
+
+	; arg4 -= CTEMP4
+	sec
+	lda	arg4_l
+	sbc	CTEMP4_L
+	sta	arg4_l
+	lda	arg4_h
+	sbc	CTEMP4_H
+	sta	arg4_h
+
+cordic_if_done:
+
+	inc	II		; next i; leave after i = 7
+	lda	II
+	cmp	#8
+	beq	cordic_loop_done
+	jmp	cordic_loop
+
+cordic_loop_done:
+
+
+	; scale by 0.625 ((x>>1) + (x>>3)) as a cheap approximation
+	; to the 0.607 scaling factor
+	; introduced by this algorithm
+	; (see https://en.wikipedia.org/wiki/CORDIC)
+
+	;  *arg3_ = (arg3 >> 1) + (arg3 >> 3);
+
+	lda	arg3_l		; TEMP1 = arg3
+	sta	TEMP1_L
+	lda	arg3_h
+	sta	TEMP1_H
+
+	lda	TEMP1_H		; TEMP1 = arg3>>1
+	cmp	#$80
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+
+	lda	TEMP1_L		; arg3 = arg3>>1
+	sta	arg3_l
+	lda	TEMP1_H
+	sta	arg3_h
+
+	ldx	#2		; TEMP1 = arg3>>3
+cp5:
+	lda	TEMP1_H
+	cmp	#$80
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+	dex
+	bne	cp5
+
+	clc			; arg3 = (arg3>>1) + (arg3>>3)
+	lda	arg3_l
+	adc	TEMP1_L
+	sta	arg3_l
+	lda	arg3_h
+	adc	TEMP1_H
+	sta	arg3_h
+
+	;  return (arg1 >> 1) + (arg1 >> 3);
+
+	lda	arg1_l		; TEMP1 = arg1
+	sta	TEMP1_L
+	lda	arg1_h
+	sta	TEMP1_H
+
+	lda	TEMP1_H		; TEMP1 = arg1>>1
+	cmp	#$80
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+
+	lda	TEMP1_L		; arg1 = arg1>>1
+	sta	arg1_l
+	lda	TEMP1_H
+	sta	arg1_h
+
+	ldx	#2		; TEMP1 = arg1>>3
+cp6:
+	lda	TEMP1_H
+	cmp	#$80
+	ror
+	sta	TEMP1_H
+	ror	TEMP1_L
+	dex
+	bne	cp6
+
+	clc			; arg1 = (arg1>>1) + (arg1>>3)
+	lda	arg1_l
+	adc	TEMP1_L
+	sta	arg1_l
+	lda	arg1_h
+	adc	TEMP1_H
+	sta	arg1_h
+
+	rts
+
+
+	; low  nibbles: 0 2 2 6 6 5 5 5 7  7 15 15   (shade N = 0..11)
+	; high nibbles: 2 2 6 6 5 5 5 7 7 15 15 15
+colors:
+	.byte $20,$22,$62,$66,$56,$55,$55,$75,$77,$F7,$FF,$FF
+
+; high bytes of the 24 row start addresses, indexed by YY
+gr_offsets_h:
+	.byte	>$400,>$480,>$500,>$580,>$600,>$680,>$700,>$780
+	.byte	>$428,>$4a8,>$528,>$5a8,>$628,>$6a8,>$728,>$7a8
+	.byte	>$450,>$4d0,>$550,>$5d0,>$650,>$6d0,>$750,>$7d0
+
+; low bytes of the same 24 row start addresses
+gr_offsets_l:
+	.byte	<$400,<$480,<$500,<$580,<$600,<$680,<$700,<$780
+	.byte	<$428,<$4a8,<$528,<$5a8,<$628,<$6a8,<$728,<$7a8
+	.byte	<$450,<$4d0,<$550,<$5d0,<$650,<$6d0,<$750,<$7d0
diff --git a/graphics/gr/donut/hello.bas b/graphics/gr/donut/hello.bas
new file mode 100644
index 00000000..0e3aae36
--- /dev/null
+++ b/graphics/gr/donut/hello.bas
@@ -0,0 +1,2 @@
+5 HOME
+40 PRINT CHR$(4)"CATALOG"
diff --git a/utils/gr-sim/donut/donut.c b/utils/gr-sim/donut/donut.c
index 8bd8790c..8102cb03 100644
--- a/utils/gr-sim/donut/donut.c
+++ b/utils/gr-sim/donut/donut.c
@@ -4,6 +4,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
+#include <stdlib.h>
 
 #include "gr-sim.h"
 #include "tfv_utils.h"
@@ -18,10 +19,15 @@ int length_cordic(int16_t x, int16_t y, int16_t *x2_, int16_t y2) {
 
   int16_t x2 = *x2_;
 
+//printf("before: x=0x%hx y=0x%hx x2=0x%hx y2=0x%hx\n",x,y,x2,y2);
+
   if (x < 0) { // start in right half-plane
     x = -x;
     x2 = -x2;
   }
+
+//printf("after: x=0x%hx y=0x%hx x2=0x%hx y2=0x%hx\n",x,y,x2,y2);
+
   for (int i = 0; i < 8; i++) {
     int16_t t = x;
     int16_t t2 = x2;
@@ -36,6 +42,7 @@ int length_cordic(int16_t x, int16_t y, int16_t *x2_, int16_t y2) {
       x2 += y2 >> i;
       y2 -= t2 >> i;
     }
+	//printf("(%d) x=0x%hx y=0x%hx x2=0x%hx y2=0x%hx\n",i,x,y,x2,y2);
   }
   // divide by 0.625 as a cheap approximation to the 0.607 scaling factor factor
   // introduced by this algorithm (see https://en.wikipedia.org/wiki/CORDIC)
@@ -43,7 +50,6 @@ int length_cordic(int16_t x, int16_t y, int16_t *x2_, int16_t y2) {
   return (x >> 1) + (x >> 3);
 }
 
-
 int main(int argc, char **argv) {
 
 	int ch;
@@ -58,17 +64,20 @@ int main(int argc, char **argv) {
 	int16_t sAsB = 0, cAsB = 0;
 	int16_t sAcB = 11583, cAcB = 11583;
 
+	const int16_t r1i = 256;
+	const int16_t r2i = 2*256;
+
 	while(1) {
 
 		// yes this is a multiply but dz is 5
 		// so it's (sb + (sb<<2)) >> 6 effectively
 
-		int p0x = (5 * sB) >> 6;
-		int p0y = (5 * sAcB) >> 6;
-		int p0z = (-5 * cAcB) >> 6;
+		/* urgh math is done in 32-bit before casting to 16? */
+		int16_t p0x = (sB + (sB<<2)) >> 6;
+		int16_t p0y = (sAcB + (sAcB<<2)) >> 6;
+		int16_t p0z = (- (cAcB +(cAcB<<2))) >> 6;
+
 
-		const int16_t r1i = 256;
-		const int16_t r2i = 2*256;
 
     int16_t yincC = (cA >> 6) + (cA >> 5);      // 12*cA >> 8;
     int16_t yincS = (sA >> 6) + (sA >> 5);      // 12*sA >> 8;
@@ -79,8 +88,7 @@ int main(int argc, char **argv) {
     int16_t ysA = -((sA >> 1) + (sA >> 4));     // -12 * yinc2 = -9*sA >> 4;
 
     for (int j = 0; j < 23; j++) {
-	ycA += yincC;
-	 ysA += yincS;
+
       int16_t xsAsB = (sAsB >> 4) - sAsB;  // -40*xincY
       int16_t xcAsB = (cAsB >> 4) - cAsB;  // -40*xincZ;
 
@@ -89,9 +97,7 @@ int main(int argc, char **argv) {
       int16_t vzi14 = ysA + xcAsB + cAcB;
 
       for (int i = 0; i < 79; i++) {
-		vxi14 += xincX;
-		vyi14 -= xincY;
-	 	vzi14 += xincZ;
+
         int16_t t = 512;
 
         int16_t px = p0x + (vxi14 >> 5);
@@ -105,19 +111,51 @@ int main(int argc, char **argv) {
           int16_t t0, t1, t2, d;
           int16_t lx = lx0, ly = ly0, lz = lz0;
 
+	//printf("lx0=0x%hx ly0=0x%hx lz0=0x%hx\n",lx0,ly0,lz0);
+	//printf("px=0x%hx py=0x%hx lx=0x%hx ly=0x%hx\n",px,py,lx,ly);
+
+
           t0 = length_cordic(px, py, &lx, ly);
 
-          t1 = t0 - r2i;
-          t2 = length_cordic(pz, t1, &lz, lx);
+	//printf("after cord t0=0x%hx lx=0x%hx\n",t0,lx);
+
+	t1 = t0 - r2i;
+
+	//printf("t1=0x%hx\n",t1);
+
+
+	//printf("pz=0x%hx t1=0x%hx lz=0x%hx lx=0x%hx\n",pz,t1,lz,lx);
+	t2 = length_cordic(pz, t1, &lz, lx);
+
+	//printf("after: t2=0x%hx lz=0x%hx\n",t2,lz);
+
           d = t2 - r1i;
+
+	//printf("d=0x%hx\n",d);
+
           t += d;
 
+	//printf("t=0x%hx\n",t);
 		// 0 2 2 6 6 5 5 7  7 15 15 15
 		// 2 2 6 6 5 5 7 7 15 15 15 15
 	int color_hi[12]={0, 2, 2, 6, 6, 5, 5, 5, 7, 7, 15, 15 };
 	int color_lo[12]={2, 2, 6, 6, 5, 5, 5, 7, 7, 15, 15, 15 };
 
+	//printf("r1i=0x%hx r2i=0x%hx\n",r1i,r2i);
+	//printf("sB=0x%hx sAcB=0x%hx cAcB=0x%x\n",sB,sAcB,cAcB);
+	//printf("p0x=0x%hx p0y=0x%hx p0z=0x%x\n",p0x,p0y,p0z);
+	//printf("cA=0x%hx yincC=0x%hx\n",cA,yincC);
+	//printf("sA=0x%hx yincS=0x%hx\n",sA,yincS);
+	//printf("cB=0x%hx xincX=0x%hx\n",cB,xincX);
+	//printf("ycA=0x%hx\n",ycA);
+	//printf("xsAsB=0x%hx sAsB=0x%hx\n",xsAsB,sAsB);
+	//printf("vxi14=0x%hx\n",vxi14);
+	//printf("lx0=0x%hx ly0=0x%hx lz0=0x%hx\n",lx0,ly0,lz0);
+	//printf("t=0x%hx d=0x%hx\n",t,d);
+
+
           if (t > 8*256) {
+//		printf("%d: t=0x%hx d=0x%hx 0\n",i,t,d);
 		color_equals(0);
 		plot(i/2,j*2);
 		plot(i/2,(j*2)+1);
@@ -127,28 +165,26 @@ int main(int argc, char **argv) {
 		if (N<0) N=0;
 		if (N>11) N=11;
 
+//		printf("%d,%d: N=%d t=0x%hx d=0x%hx\n",i,j,N,t,d);
+//		exit(1);
+
 		color_equals(color_hi[N]);
 		plot(i/2,j*2);
 		color_equals(color_lo[N]);
 		plot(i/2,(j*2)+1);
 		break;
           }
-          // todo: shift and add version of this
 
-          /*
-            if (d < dmin) dmin = d;
-            if (d > dmax) dmax = d;
-            px += d*vxi14 >> 14;
-            py += d*vyi14 >> 14;
-            pz += d*vzi14 >> 14;
-          */
-          {
+
             // 11x1.14 fixed point 3x parallel multiply
             // only 16 bit registers needed; starts from highest bit to lowest
             // d is about 2..1100, so 11 bits are sufficient
             int16_t dx = 0, dy = 0, dz = 0;
             int16_t a = vxi14, b = vyi14, c = vzi14;
+
+	//printf("a=0x%hx b=0x%hx c=0x%hx\n",a,b,c);
             while (d) {
+
               if (d&1024) {
                 dx += a;
                 dy += b;
@@ -158,14 +194,29 @@ int main(int argc, char **argv) {
               a >>= 1;
               b >>= 1;
               c >>= 1;
+		//printf("after mask: a=0x%hx b=0x%hx c=0x%hx\n",a,b,c);
             }
             // we already shifted down 10 bits, so get the last four
+		//printf("before: px=0x%hx py=0x%hx pz=0x%hx\n",px,py,pz);
+		//printf("        dx=0x%hx dy=0x%hx dz=0x%hx\n",dx,dy,dz);
+
             px += dx >> 4;
             py += dy >> 4;
             pz += dz >> 4;
-          }
+		//printf("after : px=0x%hx py=0x%hx pz=0x%hx\n",px,py,pz);
+		//printf("        dx=0x%hx dy=0x%hx dz=0x%hx\n",dx,dy,dz);
+
         }
+// i end
+		vxi14 += xincX;
+		vyi14 -= xincY;
+	 	vzi14 += xincZ;
       }
+// j end
+	ycA += yincC;
+	ysA += yincS;
+
+
     }
 
 		// rotate sines, cosines, and products thereof