From 68d5983a14890440582bb8a7bbef42a4a23f2fc5 Mon Sep 17 00:00:00 2001
From: Irmen de Jong <irmen@razorvine.net>
Date: Wed, 1 Jan 2025 19:06:11 +0100
Subject: [PATCH] optimize monogfx.plot() to use a *40 lookup table in lores
 mode. Speeds up a lot of other routines too (line etc)

---
 compiler/res/prog8lib/cx16/monogfx.p8 | 198 +++++++++++++++++++-------
 docs/source/todo.rst                  |   5 +-
 examples/test.p8                      | 198 +++++++++++++++++++++++++-
 3 files changed, 343 insertions(+), 58 deletions(-)

diff --git a/compiler/res/prog8lib/cx16/monogfx.p8 b/compiler/res/prog8lib/cx16/monogfx.p8
index d49b422e7..cc92ebd81 100644
--- a/compiler/res/prog8lib/cx16/monogfx.p8
+++ b/compiler/res/prog8lib/cx16/monogfx.p8
@@ -16,6 +16,7 @@ monogfx {
     ; read-only control variables:
     uword width = 0
     uword height = 0
+    bool lores_mode
     ubyte mode
     const ubyte MODE_NORMAL  = %00000000
     const ubyte MODE_STIPPLE = %00000001
@@ -32,6 +33,7 @@ monogfx {
         cx16.VERA_L1_TILEBASE = 0
         width = 320
         height = 240
+        lores_mode = true
         mode = MODE_NORMAL
         clear_screen(false)
     }
@@ -47,6 +49,7 @@ monogfx {
         cx16.VERA_L1_TILEBASE = %00000001
         width = 640
         height = 480
+        lores_mode = false
         mode = MODE_NORMAL
         clear_screen(false)
     }
@@ -64,15 +67,12 @@ monogfx {
 
     sub clear_screen(bool draw) {
         position(0, 0)
-        when width {
-            320 -> {
-                repeat 240/2/8
-                    cs_innerloop640(draw)
-            }
-            640 -> {
-                repeat 480/8
-                    cs_innerloop640(draw)
-            }
+        if lores_mode {                 ; 320x240 bitmap
+            repeat 240/2/8              ; 15 calls; presumably 640 bytes per call (15*640 = 320*240/8) -- confirm in cs_innerloop640
+                cs_innerloop640(draw)
+        } else {                        ; 640x480 bitmap
+            repeat 480/8                ; 60 calls (60*640 = 640*480/8 under the same assumption)
+                cs_innerloop640(draw)
         }
         position(0, 0)
     }
@@ -109,7 +109,7 @@ monogfx {
             return
         if length<=8 {
             ; just use 2 byte writes with shifted mask
-            position2(xx,yy,true)
+            position2(xx,yy)
             %asm {{
                 ldy  p8v_length
                 lda  p8v_masked_ends,y
@@ -309,8 +309,8 @@ _done
          }}
             if mode!=MODE_STIPPLE {
                 ; draw continuous line.
-                position2(xx,yy,true)
-                if width==320
+                position2(xx,yy)
+                if lores_mode
                     set_both_strides(11)    ; 40 increment = 1 line in 320 px monochrome
                 else
                     set_both_strides(12)    ; 80 increment = 1 line in 640 px monochrome
@@ -328,8 +328,8 @@ drawmode:               ora  cx16.r15L
                     lheight--
                 }
                 lheight++   ; because it is divided by 2 later, don't round off the last pixel
-                position2(xx,yy,true)
-                if width==320
+                position2(xx,yy)
+                if lores_mode
                     set_both_strides(12)    ; 80 increment = 2 line in 320 px monochrome
                 else
                     set_both_strides(13)    ; 160 increment = 2 line in 640 px monochrome
@@ -342,9 +342,9 @@ drawmode:               ora  cx16.r15L
                 }
             }
         } else {
-            position2(xx,yy,true)
+            position2(xx,yy)
             cx16.r15 = ~cx16.r15    ; erase pixels
-            if width==320
+            if lores_mode
                 set_both_strides(11)    ; 40 increment = 1 line in 320 px monochrome
             else
                 set_both_strides(12)    ; 80 increment = 1 line in 640 px monochrome
@@ -679,26 +679,51 @@ invert:
         return
 
         sub prepare() {
-            %asm {{
-                lda  p8v_xx
-                and  #7
-                pha     ; xbits
-            }}
-            xx /= 8
-            if width==320
-                xx += yy*(320/8)        ; TODO *40 table
-            else
-                xx += yy*(640/8)        ; TODO *80 table? (a bit large, need lo,mid,hi.  maybe just reuse *40 table and do 1 shift.)
-            %asm {{
-                stz  cx16.VERA_CTRL
-                stz  cx16.VERA_ADDR_H
-                lda  p8v_xx+1
-                sta  cx16.VERA_ADDR_M
-                lda  p8v_xx
-                sta  cx16.VERA_ADDR_L
-                ply     ; xbits
-                lda  p8v_maskbits,y
-            }}
+            if lores_mode {             ; 320 wide: 40 bytes per row, row offset comes from the times40 table
+                %asm {{
+                    stz  cx16.VERA_CTRL         ; VERA_CTRL = 0: address port 0
+                    stz  cx16.VERA_ADDR_H       ; VERA_ADDR_H = 0: no auto-increment bits set
+                    ; A = xx / 8, the byte column that holds the pixel
+                    lda  p8v_xx+1
+                    lsr  a                      ; bit 8 of xx -> carry
+                    lda  p8v_xx
+                    ror  a                      ; A = low 8 bits of xx>>1, old carry becomes bit 7
+                    lsr  a
+                    lsr  a                      ; A = xx>>3 (exact as long as xx < 512)
+                    ; VERA address = times40[yy] + column; only the low byte of yy is used as index
+                    clc
+                    ldy  p8v_yy
+                    adc  p8v_times40_lsb,y
+                    sta  cx16.VERA_ADDR_L
+                    lda  p8v_times40_msb,y
+                    adc  #0                     ; add the carry out of the low-byte addition
+                    sta  cx16.VERA_ADDR_M
+                    ; A = maskbits[xx & 7], the bit for this pixel (maskbits is defined outside this hunk)
+                    lda  p8v_xx
+                    and  #7
+                    tax
+                    lda  p8v_maskbits,x
+                }}
+            } else {
+                ; width=640 (hires)
+                %asm {{
+                    stz  cx16.VERA_CTRL         ; VERA_CTRL = 0: address port 0
+                    stz  cx16.VERA_ADDR_H       ; VERA_ADDR_H = 0: no auto-increment bits set
+                    lda  p8v_xx
+                    and  #7
+                    pha     ; xbits
+                }}
+                xx /= 8                 ; byte column within the row
+                xx += yy*(640/8)        ; plus 80 bytes per row; computed by the compiler, no table for hires
+                %asm {{
+                    lda  p8v_xx+1
+                    sta  cx16.VERA_ADDR_M
+                    lda  p8v_xx
+                    sta  cx16.VERA_ADDR_L
+                    plx     ; xbits
+                    lda  p8v_maskbits,x         ; A = maskbits[original xx & 7]
+                }}
+            }
         }
     }
 
@@ -718,10 +743,21 @@ invert:
             pha     ; xbits
         }}
         xx /= 8
-        if width==320
-            xx += yy*(320/8)        ; TODO *40 table
+        if lores_mode {
+            %asm {{
+                ; xx += yy * 40
+                ldy  p8v_yy
+                lda  p8v_xx
+                clc
+                adc  p8v_times40_lsb,y
+                sta  p8v_xx
+                lda  p8v_xx+1
+                adc  p8v_times40_msb,y
+                sta  p8v_xx+1
+            }}
+        }
         else
-            xx += yy*(640/8)        ; TODO *80 table? (a bit large, need lo,mid,hi  maybe just reuse *40 table and do 1 shift.)
+            xx += yy*(640/8)
 
         %asm {{
             stz  cx16.VERA_CTRL
@@ -830,10 +866,21 @@ skip:
                 pha     ; xbits
             }}
             xpos /= 8
-            if width==320
-                xpos += yy*(320/8) as uword     ; TODO *40 table
+            if lores_mode {
+                %asm {{
+                    ; xpos += yy*40
+                    ldy  p8v_yy
+                    lda  p8v_xpos
+                    clc
+                    adc  p8v_times40_lsb,y
+                    sta  p8v_xpos
+                    lda  p8v_xpos+1
+                    adc  p8v_times40_msb,y
+                    sta  p8v_xpos+1
+                }}
+            }
             else
-                xpos += yy*(640/8) as uword     ; TODO *80 table? (a bit large, need lo,mid,hi  maybe just reuse *40 table and do 1 shift.)
+                xpos += yy*(640/8) as uword
 
             %asm {{
                 stz  cx16.VERA_CTRL
@@ -878,20 +925,63 @@ _doplot         beq  +
         }
     }
 
-    sub position(uword @zp xx, uword yy) {
-        if width==320
-            cx16.r0 = yy*(320/8)        ; TODO *40 table
-        else
-            cx16.r0 = yy*(640/8)        ; TODO *80 table? (a bit large, need lo,mid,hi  maybe just reuse *40 table and do 1 shift.)
-        cx16.vaddr(0, cx16.r0+(xx/8), 0, 1)
+    sub position(uword xx, uword yy) {      ; point VERA at the bitmap byte that contains pixel (xx,yy)
+        if lores_mode {
+            %asm {{
+                stz  cx16.VERA_CTRL     ; VERA_CTRL = 0: address port 0
+                lda  p8v_xx+1
+                lsr  a                  ; bit 8 of xx -> carry
+                lda  p8v_xx
+                ror  a                  ; A = low 8 bits of xx>>1
+                lsr  a
+                lsr  a                  ; A = xx>>3 = byte column (exact as long as xx < 512)
+                clc
+                ldy  p8v_yy             ; only the low byte of yy indexes the 240-entry table
+                adc  p8v_times40_lsb,y
+                sta  cx16.VERA_ADDR_L
+                lda  p8v_times40_msb,y
+                adc  #0                 ; add the carry out of the low-byte addition
+                sta  cx16.VERA_ADDR_M
+                lda  #%00010000     ; autoincr
+                sta  cx16.VERA_ADDR_H
+            }}
+        }
+        else {
+            cx16.r0 = yy*(640/8)        ; hires: 80 bytes per row, computed instead of looked up
+            cx16.vaddr(0, cx16.r0+(xx/8), 0, 1)     ; presumably (bank, address, port, autoincrement) -- confirm against cx16.vaddr
+        }
+        return
     }
 
-    sub position2(uword @zp xx, uword yy, bool also_port_1) {
+    sub position2(uword xx, uword yy) {
         position(xx, yy)
-        if also_port_1
-            cx16.vaddr_clone(0)
+        ; also set port 1 like that (now unconditional; it used to depend on the removed also_port_1 parameter)
+        cx16.vaddr_clone(0)
     }
 
+    ; y*40 lookup table, indexed by y (0..239); the asm above reads its bytes as p8v_times40_lsb,y and p8v_times40_msb,y. Pretty compact because it all fits in a word and we only need 240 y positions.
+    ; a y*80 lookup table would be very large (lo,mid,hi for 480 values...)
+    uword[240] @split @shared times40 = [
+        0, 40, 80, 120, 160, 200, 240, 280, 320, 360, 400, 440, 480, 520, 560, 600,
+        640, 680, 720, 760, 800, 840, 880, 920, 960, 1000, 1040, 1080, 1120, 1160,
+        1200, 1240, 1280, 1320, 1360, 1400, 1440, 1480, 1520, 1560, 1600, 1640, 1680,
+        1720, 1760, 1800, 1840, 1880, 1920, 1960, 2000, 2040, 2080, 2120, 2160, 2200,
+        2240, 2280, 2320, 2360, 2400, 2440, 2480, 2520, 2560, 2600, 2640, 2680, 2720,
+        2760, 2800, 2840, 2880, 2920, 2960, 3000, 3040, 3080, 3120, 3160, 3200, 3240,
+        3280, 3320, 3360, 3400, 3440, 3480, 3520, 3560, 3600, 3640, 3680, 3720, 3760,
+        3800, 3840, 3880, 3920, 3960, 4000, 4040, 4080, 4120, 4160, 4200, 4240, 4280,
+        4320, 4360, 4400, 4440, 4480, 4520, 4560, 4600, 4640, 4680, 4720, 4760, 4800,
+        4840, 4880, 4920, 4960, 5000, 5040, 5080, 5120, 5160, 5200, 5240, 5280, 5320,
+        5360, 5400, 5440, 5480, 5520, 5560, 5600, 5640, 5680, 5720, 5760, 5800, 5840,
+        5880, 5920, 5960, 6000, 6040, 6080, 6120, 6160, 6200, 6240, 6280, 6320, 6360,
+        6400, 6440, 6480, 6520, 6560, 6600, 6640, 6680, 6720, 6760, 6800, 6840, 6880,
+        6920, 6960, 7000, 7040, 7080, 7120, 7160, 7200, 7240, 7280, 7320, 7360, 7400,
+        7440, 7480, 7520, 7560, 7600, 7640, 7680, 7720, 7760, 7800, 7840, 7880, 7920,
+        7960, 8000, 8040, 8080, 8120, 8160, 8200, 8240, 8280, 8320, 8360, 8400, 8440,
+        8480, 8520, 8560, 8600, 8640, 8680, 8720, 8760, 8800, 8840, 8880, 8920, 8960,
+        9000, 9040, 9080, 9120, 9160, 9200, 9240, 9280, 9320, 9360, 9400, 9440, 9480,
+        9520, 9560]
+
     const ubyte charset_bank = $1
     const uword charset_addr = $f000       ; in bank 1, so $1f000
 
@@ -948,7 +1038,7 @@ _doplot         beq  +
                 bne  --
             }}
             ; left part of shifted char
-            position2(xx, yy, true)
+            position2(xx, yy)
             set_autoincrs()
             if draw {
                 %asm {{
@@ -974,7 +1064,7 @@ cdraw_mod1          ora  cx16.VERA_DATA1
             }
             ; right part of shifted char
             if lsb(xx) & 7 !=0 {
-                position2(xx+8, yy, true)
+                position2(xx+8, yy)
                 set_autoincrs()
                 if draw {
                     %asm {{
@@ -1005,7 +1095,7 @@ cdraw_mod2              ora  cx16.VERA_DATA1
 
         sub set_autoincrs() {
             ; set autoincrements to go to next pixel row (40 or 80 increment)
-            if width==320 {
+            if lores_mode {
                 cx16.VERA_CTRL = 1
                 cx16.VERA_ADDR_H = cx16.VERA_ADDR_H & $0f | (11<<4)
                 cx16.VERA_CTRL = 0
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 210bda1c7..9ee8d50e3 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -1,6 +1,10 @@
 TODO
 ====
 
+- optimize word addition  word += mul40[indexbyte]  to use adc with register indexed instructions
+- same with other operators (sbc, and, or, eor)
+- how is the codegen for byte values here?
+
 - add paypal donation button as well?
 - announce prog8 on the 6502.org site?
 
@@ -74,7 +78,6 @@ IR/VM
 Libraries
 ---------
 - Sorting module gnomesort_uw could be optimized more, rewrite in asm? Shellshort seems consistently faster even if most of the words are already sorted.
-- Monogfx: use *40 multiplication lookup tables (possibly *80 as well? but those are a bit large; needing lo,mid,hi)
 - Add split-word array sorting routines to sorting module?
 - pet32 target: make syslib more complete (missing kernal routines)?
 - need help with: PET disk routines (OPEN, SETLFS etc are not exposed as kernal calls)
diff --git a/examples/test.p8 b/examples/test.p8
index 192c8771e..05838d8b6 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -1,9 +1,201 @@
-%zeropage basicsafe
+%import textio
+%import math
+%import monogfx
+%import gfx_lores
 %option no_sysinit
 
 main {
     sub start() {
-        if not unexistingsymbol
-            cx16.r0++
+        ; Benchmark: draw the same 1000 random lines with three line routines and print the timer value after each run.
+        math.rndseed(1234,8877)         ; fixed seed so every run gets the same sequence of lines
+        cbm.SETTIM(0,0,0)               ; reset the timer before the run
+        kernal()
+        txt.print_uw(cbm.RDTIM16())     ; timer value after the run
+
+        sys.wait(200)
+        ; second run: gfx_lores.line
+        math.rndseed(1234,8877)
+        cbm.SETTIM(0,0,0)
+        custom_256()
+        txt.print_uw(cbm.RDTIM16())
+
+        sys.wait(200)
+
+        ; third run: the local line() below, which plots via the *40 lookup table
+        math.rndseed(1234,8877)
+        cbm.SETTIM(0,0,0)
+        custom_mono()
+        txt.print_uw(cbm.RDTIM16())
+
+        repeat {                        ; loop forever so the results stay on screen
+        }
     }
+
+    sub kernal() {                      ; reference run: 1000 random lines via GRAPH_draw_line
+        cx16.set_screen_mode(128)       ; presumably the 320x240 graphics mode (coordinates below stay within 320x240) -- confirm
+        cx16.GRAPH_set_colors(2,1,0)
+        cx16.GRAPH_clear()
+        repeat 1000 {
+            cx16.r0 = math.rndw() % 320     ; x1
+            cx16.r1 = math.rnd() % 240      ; y1
+            cx16.r2 = math.rndw() % 320     ; x2
+            cx16.r3 = math.rnd() % 240      ; y2
+            cx16.GRAPH_draw_line(cx16.r0, cx16.r1, cx16.r2, cx16.r3)
+        }
+        cx16.set_screen_mode(0)         ; leave graphics mode again before start() prints the timing
+    }
+
+    sub custom_mono() {                 ; 1000 random lines in monogfx lores mode, drawn by the local line() sub
+        monogfx.lores()
+        repeat 1000 {
+            cx16.r0 = math.rndw() % 320     ; x1
+            cx16.r1L = math.rnd() % 240     ; y1
+            cx16.r2 = math.rndw() % 320     ; x2
+            cx16.r3L = math.rnd() % 240     ; y2
+            line(cx16.r0, cx16.r1L, cx16.r2, cx16.r3L)      ; this file's own line(), not monogfx.line
+        }
+        monogfx.textmode()
+    }
+
+    sub custom_256() {                  ; 1000 random lines drawn with gfx_lores.line
+        gfx_lores.graphics_mode()
+        repeat 1000 {
+            cx16.r0 = math.rndw() % 320     ; x1
+            cx16.r1L = math.rnd() % 240     ; y1
+            cx16.r2 = math.rndw() % 320     ; x2
+            cx16.r3L = math.rnd() % 240     ; y2
+            gfx_lores.line(cx16.r0, cx16.r1L, cx16.r2, cx16.r3L, 2)     ; last argument is presumably the color -- confirm
+        }
+        gfx_lores.text_mode()
+    }
+
+    sub line(uword @zp x1, ubyte @zp y1, uword @zp x2, ubyte @zp y2) {
+        ; Bresenham algorithm: draws from (x1,y1) to (x2,y2) by setting pixels with the table-driven plot() below.
+        ; This code special-cases various quadrant loops to allow simple ++ and -- operations.
+        if y1>y2 {
+            ; make sure dy is always positive to have only 4 instead of 8 special cases
+            cx16.r0 = x1
+            x1 = x2
+            x2 = cx16.r0
+            cx16.r0L = y1
+            y1 = y2
+            y2 = cx16.r0L
+        }
+        word @zp dx = (x2 as word)-x1
+        ubyte @zp dy = y2-y1
+        ; purely vertical or horizontal lines are handed to monogfx's dedicated routines
+        if dx==0 {
+            monogfx.vertical_line(x1, y1, abs(dy) as uword +1, true)
+            return
+        }
+        if dy==0 {
+            if x1>x2
+                x1=x2       ; start from the leftmost end point
+            monogfx.horizontal_line(x1, y1, abs(dx) as uword +1, true)
+            return
+        }
+        ; cx16.r1L remembers the x direction; dx itself is made positive below
+        cx16.r1L = 1 ;; true      ; 'positive_ix'
+        if dx < 0 {
+            dx = -dx
+            cx16.r1L = 0 ;; false
+        }
+        word @zp dx2 = dx*2
+        word @zp dy2 = (dy as word)*2      ; widen first: dy is a ubyte, so dy*2 at byte width would wrap once dy >= 128
+        word @zp d = 0
+        cx16.VERA_CTRL = 0          ; address port 0
+        cx16.VERA_ADDR_H = 0        ; no auto-increment: plot() modifies VERA_DATA0 in place
+        if dx >= dy {               ; x-major: x steps every iteration, y steps when the error term d exceeds dx
+            if cx16.r1L!=0 {
+                repeat {
+                    plot()
+                    if x1==x2
+                        return
+                    x1++
+                    d += dy2
+                    if d > dx {
+                        y1++
+                        d -= dx2
+                    }
+                }
+            } else {
+                repeat {
+                    plot()
+                    if x1==x2
+                        return
+                    x1--
+                    d += dy2
+                    if d > dx {
+                        y1++
+                        d -= dx2
+                    }
+                }
+            }
+        }
+        else {                      ; y-major: y steps every iteration, x steps when the error term d exceeds dy
+            if cx16.r1L!=0 {
+                repeat {
+                    plot()
+                    if y1==y2
+                        return
+                    y1++
+                    d += dx2
+                    if d > dy {
+                        x1++
+                        d -= dy2
+                    }
+                }
+            } else {
+                repeat {
+                    plot()
+                    if y1==y2
+                        return
+                    y1++
+                    d += dx2
+                    if d > dy {
+                        x1--
+                        d -= dy2
+                    }
+                }
+            }
+        }
+        ; plot(): sets the pixel at the current (x1,y1) of the enclosing sub; uses A, X and Y and returns via its own rts
+        asmsub plot() {
+            %asm {{
+                lda  p8v_x1+1
+                lsr  a                  ; bit 8 of x1 -> carry
+                lda  p8v_x1
+                ror  a
+                lsr  a
+                lsr  a                  ; A = x1>>3 = byte column (exact as long as x1 < 512)
+                ; VERA address = y1*40 + column
+                clc
+                ldy  p8v_y1
+                adc  times40_lo,y
+                sta  cx16.VERA_ADDR_L
+                lda  times40_mid,y
+                adc  #0                 ; add the carry out of the low-byte addition
+                sta  cx16.VERA_ADDR_M
+                ; OR the pixel's bit into the byte at that address
+                lda  p8v_x1
+                and  #7
+                tax
+                lda  maskbits,x
+                tsb  cx16.VERA_DATA0
+                rts
+
+maskbits    .byte  128,64,32,16,8,4,2,1
+; multiplication by 40 lookup table
+times40 := 40*range(240)
+
+times40_lo     .byte <times40
+times40_mid    .byte >times40
+
+            ; !notreached!
+    }}
+        }
+
+    }
+
+
 }