shave 61381 cycles from all precomputed 2-bit HGR transitions (~4% faster)

2025-08-10 08:25:13 +00:00 · 2020-09-05 22:06:35 -04:00
parent 9bb4524402
commit f081f94756
1 changed file with 55 additions and 48 deletions
--- a/src/fx/fx.hgr.precomputed.2bit.a
+++ b/src/fx/fx.hgr.precomputed.2bit.a
@@ -1,20 +1,9 @@
 ;license:MIT
-;(c) 2019 by 4am
+;(c) 2019-2020 by 4am
 ;
-mirror_src1    = $E8      ; word
-mirror_dest1   = $EA      ; word
-mirror_src2    = $EC      ; word
-mirror_dest2   = $EE      ; word
-src1           = $F0      ; word
-dest1          = $F2      ; word
-src2           = $F4      ; word
-dest2          = $F6      ; word
-reverse_input  = $FC      ; word
-input          = $FE      ; word
-
 copymasks      = $0200    ; $100 bytes but sparse, index is 0..4 but in high 3 bits, so $00, $20, $40, $60, $80
 mirror_copymasks = $0201
-hgrlo          = $0300    ; $C0 bytes
+hgrlo          = $0301    ; $C0 bytes; based at $0301 so "lda hgrlo-1, x" indexes from $0300 and cannot cross a page for X = $01..$C0 (presumably why it moved from $0300: a $02FF base costs an extra cycle on every indexed read)
 hgrlomirror    = $BD40    ; $C0 bytes
 mirror_cols    = $BE00    ; $28 bytes
 hgr1hi         = $BE40    ; $C0 bytes
@@ -27,7 +16,7 @@ hgr1himirror   = $BF40    ; $C0 bytes
         ldx   #$27
         ldy   #$00
 -        tya
-         sta   .mirror_cols,x
+         sta   .mirror_cols, x
         iny
         dex
         bpl   -
@@ -70,71 +59,89 @@ hgr1himirror   = $BF40    ; $C0 bytes

 !macro ROW_X_TO_2BIT_BASE_ADDRESSES {
         ; X = $01..$C0, mapping to row 0..191
-         lda   hgrlo-1,x
-         sta   dest1
-         sta   src1
-         lda   hgr1hi-1,x
-         sta   dest1+1
+         lda   hgrlo-1, x            ; table entry X-1: address low byte shared by dest1 and src1
+         sta   <dest1                ; dest1 labels the operand of "sta $FDFD, y" in the page-zero loop, so this patches that instruction ('<' = low byte of the label address)
+         sta   <src1                 ; likewise patches the operand low byte of "lda $FDFD, y" at src1
+         lda   hgr1hi-1, x           ; table entry X-1: address high byte for dest1
+         sta   <dest1+1              ; second operand byte of the sta at dest1
         eor   #$60
-         sta   src1+1
-         lda   hgrlo,x
-         sta   dest2
-         sta   src2
-         lda   hgr1hi,x
-         sta   dest2+1
+         sta   <src1+1               ; src1 high byte = dest1 high byte EOR $60 (presumably selects the other hi-res page -- TODO confirm)
+         lda   hgrlo, x              ; table entry X (the one after the entry used above): low byte shared by dest2 and src2
+         sta   <dest2                ; patches operand low byte of the sta at dest2
+         sta   <src2                 ; patches operand low byte of the lda at src2
+         lda   hgr1hi, x             ; table entry X: address high byte for dest2
+         sta   <dest2+1              ; second operand byte of the sta at dest2
         eor   #$60
-         sta   src2+1
+         sta   <src2+1               ; src2 high byte = dest2 high byte EOR $60; X and Y are left unchanged by this macro
 }

 !macro HIGH_3_LOW_5 .input {
         and   #%11100000            ; second value: high 3 bits = index into tables to find bitmasks
         tax
-         eor   (.input),y            ; second value: low 5 bits = byte offset within the row (implicitly "and #%00011111")
+         eor   (.input), y           ; second value: low 5 bits = byte offset within the row (implicitly "and #%00011111"); works because the caller enters with A = the byte at (.input),y, so EOR against the full byte cancels the 3 high bits kept above. On exit X = high 3 bits, Y = low 5 bits
         tay
 }

-!macro INC_INPUT_AND_LOOP .loop {
-         inc   input
-         beq   +
-         jmp   .loop
-+        bit   $c000
-         bmi   +
-         inc   input+1
-         jmp   .loop
-+        rts
-}
-
 !macro FX_PRECOMPUTED_2BIT .coords {
         +BUILD_HGR_LOOKUP_TABLES hgrlo, hgr1hi
         +BUILD_MIRROR_COLS mirror_cols
         +BUILD_SPARSE_BITMASKS_2BIT copymasks, mirror_copymasks
+         ldx   #(end-start)          ; copy InputLoop code to zero page; X = byte length of start..end, counts down to 1
+-        lda   start-1, x            ; read template byte at offset X-1 from start
+         sta   $FF, x                ; store at $FF+X, which wraps inside page zero to address X-1 (assumes $FF,x assembles as zero-page indexed)
+         dex
+         bne   -                     ; stops at X=0, after offsets len-1 .. 0 have been copied
         +LDADDR .coords
-         +ST16 input
-         jmp   InputLoop
+         sta   <input                ; patch the relocated copy (done after the copy loop): A -> low operand byte of "lda $FDFD, y" at input; presumably LDADDR leaves the address of .coords in A/Y -- confirm
+         sty   <input+1              ; Y -> high operand byte of the same instruction
+         lda   #0                    ; build a fake return address of $0000 on the stack
+         pha
+         pha                         ; execution then falls into the rts at start (template copy of Exit2Bit), which pulls $0000 and resumes at $0001 = InputLoop in page zero
+start                                ; start..end bracket the template bytes that the setup code copies to page zero
+!pseudopc 0 {                        ; assemble the enclosed code as if it lived at address $0000
 Exit2Bit rts
 InputLoop
         ldy   #0
-         lda   (input),y             ; first value: HGR row + 1
+input=*+1                            ; input = address of the 16-bit operand of the next instruction (self-modified data pointer)
+         lda   $FDFD, y              ; first value: HGR row + 1 ($FDFD is a placeholder operand, overwritten through input; Y = 0 here)
         beq   Exit2Bit              ; if 0 then we're done
         tax
         +ROW_X_TO_2BIT_BASE_ADDRESSES

-         inc   input
-         lda   (input),y
+         inc   <input                ; step to the second value of the pair; only the low byte is bumped (assumes pairs are 2-byte aligned so this cannot wrap -- cf. the align-2 padding after end)
+         lda   (<input), y           ; Y is still 0: the patched operand bytes double as a zero-page indirect pointer
         +HIGH_3_LOW_5 input

         ; main 2x2 block in left half
-         +COPY_BIT src1, dest1, copymasks
-         +COPY_BIT src2, dest2, copymasks
+src1=*+1                             ; src1 = operand of the next lda, filled in by ROW_X_TO_2BIT_BASE_ADDRESSES
+         lda   $FDFD, y              ; A = source byte at src1+Y
+         eor   (<dest1), y           ; A = source XOR destination; the sta operand at dest1 is reused as a zero-page pointer
+         and   copymasks, x          ; keep only the differing bits selected by the mask at copymasks+X
+         eor   (<dest1), y           ; A = destination with the masked bits replaced by source bits
+dest1=*+1                            ; dest1 = operand of the next sta, filled in by ROW_X_TO_2BIT_BASE_ADDRESSES
+         sta   $FDFD, y              ; write the merged byte to dest1+Y
+src2=*+1                             ; same merge for the src2/dest2 pair
+         lda   $FDFD, y              ; A = source byte at src2+Y
+         eor   (<dest2), y           ; A = source XOR destination
+         and   copymasks, x          ; same mask as the first row
+         eor   (<dest2), y           ; A = destination with the masked bits replaced by source bits
+dest2=*+1                            ; dest2 = operand of the next sta
+         sta   $FDFD, y              ; write the merged byte to dest2+Y

         ; corresponding 2x2 block in right half (same row, opposite column)
-         lda   mirror_cols,y
+         lda   mirror_cols, y        ; look up the replacement byte offset for Y in the mirror_cols table
         tay
         +COPY_BIT src1, dest1, mirror_copymasks
         +COPY_BIT src2, dest2, mirror_copymasks

-         +INC_INPUT_AND_LOOP InputLoop
-         rts
+         inc   <input                ; step past the second value; result is zero only when the low byte wraps to $00
+         bne   InputLoop             ; still inside the same 256-byte page: process the next pair
+         bit   $c000                 ; reached only on a page wrap: test bit 7 of $C000 (presumably the Apple II keyboard register -- confirm)
+         bmi   Exit2Bit              ; bit 7 set: leave early through the rts at Exit2Bit
+         inc   <input+1              ; otherwise carry into the pointer high byte
+         bne   InputLoop             ; always branches
+}
+end                                  ; first byte after the relocatable block; end-start is the copy length
 !if * and 1 {
         !byte 0 ;align 2 but avoids the fake allocation bug if it was aligned already
 }