Fast 6502 LZSA2 depacker: smaller size, same speed

This commit is contained in:
Emmanuel Marty 2020-01-03 10:01:23 +01:00 committed by GitHub
parent 8721c11041
commit 410544f4e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -5,10 +5,12 @@
; ;
; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format. ; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format.
; ;
; This code is written for the ACME assembler.
;
; Optional code is presented for two minor 6502 optimizations that break ; Optional code is presented for two minor 6502 optimizations that break
; compatibility with the current LZSA2 format standard. ; compatibility with the current LZSA2 format standard.
; ;
; This code is written for the ACME assembler. ; The code is 241 bytes for the small version, and 256 bytes for the normal.
; ;
; Copyright John Brandwood 2019. ; Copyright John Brandwood 2019.
; ;
@ -46,12 +48,34 @@ LZSA_SWAP_LEN16 = 0
LZSA_SWAP_XZY = 0 LZSA_SWAP_XZY = 0
; ;
; Remove code inlining to save space? ; Choose size over speed (within sane limits)?
;
; This saves 15 bytes of code, but decompression is 7% slower.
; ;
LZSA_BEST_SIZE = 0 LZSA_SMALL_SIZE = 0
;
; Remove code inlining to save space?
;
; This saves 15 bytes of code at the cost of 7% speed.
;
!if LZSA_SMALL_SIZE {
LZSA_NO_INLINE = 1
} else {
LZSA_NO_INLINE = 0
}
;
; Use smaller code for copying literals?
;
; This saves 11 bytes of code at the cost of 5% speed.
;
!if LZSA_SMALL_SIZE {
LZSA_SHORT_CP = 1
} else {
LZSA_SHORT_CP = 0
}
; ;
; Assume that we're decompressing from a large multi-bank ; Assume that we're decompressing from a large multi-bank
@ -66,27 +90,23 @@ LZSA_FROM_BANK = 0
; ;
!if LZSA_FROM_BANK { !if LZSA_FROM_BANK {
!macro LZSA_INC_PAGE { !macro LZSA_INC_PAGE {
jsr .next_page jsr .next_page
} }
} else { } else {
!macro LZSA_INC_PAGE { !macro LZSA_INC_PAGE {
inc <lzsa_srcptr + 1 inc <lzsa_srcptr + 1
} }
} }
; ;
; Macro to read a byte from the compressed source data. ; Macro to read a byte from the compressed source data.
; ;
!if LZSA_BEST_SIZE { !if LZSA_NO_INLINE {
!macro LZSA_GET_SRC { !macro LZSA_GET_SRC {
jsr .get_byte jsr lzsa2_get_byte
} }
} else { } else {
@ -104,10 +124,13 @@ LZSA_FROM_BANK = 0
; ;
; Macro to speed up reading 50% of nibbles. ; Macro to speed up reading 50% of nibbles.
; ;
; This seems to save very few cycles compared to the
; increase in code size, and it isn't recommended.
;
LZSA_SLOW_NIBL = 1 LZSA_SLOW_NIBL = 1
!if (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) { !if (LZSA_SLOW_NIBL | LZSA_SMALL_SIZE) {
!macro LZSA_GET_NIBL { !macro LZSA_GET_NIBL {
jsr lzsa2_get_nibble ; Always call a function. jsr lzsa2_get_nibble ; Always call a function.
@ -118,9 +141,9 @@ LZSA_SLOW_NIBL = 1
!macro LZSA_GET_NIBL { !macro LZSA_GET_NIBL {
lsr <lzsa_nibflg ; Is there a nibble waiting? lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble. lda <lzsa_nibble ; Extract the lo-nibble.
bcs .skip\@ bcs .skip
jsr .new_nibble ; Extract the hi-nibble. jsr lzsa2_new_nibble ; Extract the hi-nibble.
.skip\@: ora #$F0 .skip: ora #$F0
} }
} }
@ -141,15 +164,19 @@ lzsa_winptr = $FA ; 1 word.
lzsa_srcptr = $FC ; 1 word. lzsa_srcptr = $FC ; 1 word.
lzsa_dstptr = $FE ; 1 word. lzsa_dstptr = $FE ; 1 word.
lzsa_length = lzsa_winptr ; 1 word.
LZSA_SRC_LO = $FC LZSA_SRC_LO = $FC
LZSA_SRC_HI = $FD LZSA_SRC_HI = $FD
LZSA_DST_LO = $FE LZSA_DST_LO = $FE
LZSA_DST_HI = $FF LZSA_DST_HI = $FF
; *************************************************************************** ; ***************************************************************************
; *************************************************************************** ; ***************************************************************************
; ;
; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2b format. ; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2 format.
; ;
; Args: lzsa_srcptr = ptr to compressed data ; Args: lzsa_srcptr = ptr to compressed data
; Args: lzsa_dstptr = ptr to output buffer ; Args: lzsa_dstptr = ptr to output buffer
@ -182,20 +209,45 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
jsr .get_length ; X=0 table index for literals. jsr .get_length ; X=0 table index for literals.
!if LZSA_SHORT_CP {
.got_cp_len: cmp #0 ; Check the lo-byte of length.
beq .put_cp_len
inx ; Increment # of pages to copy.
.put_cp_len: stx <lzsa_length
tax
.cp_page: lda (lzsa_srcptr),y
sta (lzsa_dstptr),y
inc <lzsa_srcptr + 0
bne .skip1
inc <lzsa_srcptr + 1
.skip1: inc <lzsa_dstptr + 0
bne .skip2
inc <lzsa_dstptr + 1
.skip2: dex
bne .cp_page
dec <lzsa_length ; Any full pages left to copy?
bne .cp_page
} else {
.got_cp_len: tay ; Check the lo-byte of length. .got_cp_len: tay ; Check the lo-byte of length.
beq .cp_page beq .cp_page
inx ; Increment # of pages to copy. inx ; Increment # of pages to copy.
.get_cp_src: clc ; Calc source for partial .get_cp_src: clc ; Calc address of partial page.
adc <lzsa_srcptr + 0 ; page. adc <lzsa_srcptr + 0
sta <lzsa_srcptr + 0 sta <lzsa_srcptr + 0
bcs .get_cp_dst bcs .get_cp_dst
dec <lzsa_srcptr + 1 dec <lzsa_srcptr + 1
.get_cp_dst: tya .get_cp_dst: tya
clc ; Calc destination for partial clc ; Calc address of partial page.
adc <lzsa_dstptr + 0 ; page. adc <lzsa_dstptr + 0
sta <lzsa_dstptr + 0 sta <lzsa_dstptr + 0
bcs .get_cp_idx bcs .get_cp_idx
dec <lzsa_dstptr + 1 dec <lzsa_dstptr + 1
@ -214,6 +266,8 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
dex ; Any full pages left to copy? dex ; Any full pages left to copy?
bne .cp_page bne .cp_page
}
!if LZSA_SWAP_XZY { !if LZSA_SWAP_XZY {
; ;
@ -240,10 +294,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
bne .get_9_bits bne .get_9_bits
.get_5_bits: dex ; X=$FF .get_5_bits: dex ; X=$FF
.get_13_bits: LZSA_GET_NIBL ; Always returns with CS. .get_13_bits: +LZSA_GET_NIBL ; Always returns with CS.
bvc .get_5_skip bvc .get_5_skip
clc clc
.get_5_skip: rol a ; Shift into position, set C. .get_5_skip: rol ; Shift into position, set C.
cpx #$00 ; X=$FF for a 5-bit offset. cpx #$00 ; X=$FF for a 5-bit offset.
bne .set_offset bne .set_offset
sbc #2 ; Subtract 512 because 13-bit sbc #2 ; Subtract 512 because 13-bit
@ -264,7 +318,7 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; ;
; Slower and longer path with STD order of bits. ; Slower and longer path with STD order of bits.
; ;
; Z80 NES ; STD NEW
; ================================ ; ================================
; xyz xzy ; xyz xzy
; 00z 0z0 5-bit offset ; 00z 0z0 5-bit offset
@ -274,6 +328,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; 111 111 repeat offset ; 111 111 repeat offset
; NVZ for a BIT instruction ; NVZ for a BIT instruction
; ;
; N.B. Costs 3 bytes in code length.
; get5 and get13 are 8 cycles slower.
; get9, get16, and rep are 4 cycles slower.
;
.lz_offset: lda <lzsa_cmdbuf .lz_offset: lda <lzsa_cmdbuf
asl asl
@ -313,7 +371,7 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; N.B. X=0 is expected and guaranteed when we get here. ; N.B. X=0 is expected and guaranteed when we get here.
; ;
.get_16_bits: jsr .get_byte ; Get hi-byte of offset. .get_16_bits: jsr lzsa2_get_byte ; Get hi-byte of offset.
tax tax
.get_low8: +LZSA_GET_SRC ; Get lo-byte of offset. .get_low8: +LZSA_GET_SRC ; Get lo-byte of offset.
@ -379,8 +437,6 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; ;
; Get 16-bit length in X:A register pair. ; Get 16-bit length in X:A register pair.
; ;
; N.B. Requires reversal of bytes in 16-bit length.
;
.get_length: +LZSA_GET_NIBL .get_length: +LZSA_GET_NIBL
cmp #$FF ; Extended length? cmp #$FF ; Extended length?
@ -390,28 +446,29 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
.got_length: ldx #$00 ; Set hi-byte of 4 & 8 bit .got_length: ldx #$00 ; Set hi-byte of 4 & 8 bit
rts ; lengths. rts ; lengths.
.byte_length: jsr .get_byte ; So rare, this can be slow! .byte_length: jsr lzsa2_get_byte ; So rare, this can be slow!
adc .byte_len_tbl,x ; Always CS from previous CMP. adc .byte_len_tbl,x ; Always CS from previous CMP.
bcc .got_length bcc .got_length
beq .finished beq .finished
!if LZSA_SWAP_LEN16 { !if LZSA_SWAP_LEN16 {
.word_length: jsr .get_byte ; So rare, this can be slow! .word_length: jsr lzsa2_get_byte ; So rare, this can be slow!
tax tax
} else { } else {
.word_length: jsr .get_byte ; So rare, this can be slow! .word_length: jsr lzsa2_get_byte ; So rare, this can be slow!
pha pha
jsr .get_byte ; So rare, this can be slow! jsr lzsa2_get_byte ; So rare, this can be slow!
tax tax
pla pla
rts rts
} }
.get_byte: lda (lzsa_srcptr),y ; Subroutine version for when lzsa2_get_byte:
lda (lzsa_srcptr),y ; Subroutine version for when
inc <lzsa_srcptr + 0 ; inlining isn't advantageous. inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
beq .next_page beq .next_page
rts rts
@ -430,9 +487,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; Get a nibble value from compressed data in A. ; Get a nibble value from compressed data in A.
; ;
!if (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) { !if (LZSA_SLOW_NIBL | LZSA_SMALL_SIZE) {
lzsa2_get_nibble: lsr <lzsa_nibflg ; Is there a nibble waiting? lzsa2_get_nibble:
lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble. lda <lzsa_nibble ; Extract the lo-nibble.
bcs .got_nibble bcs .got_nibble
@ -453,13 +511,14 @@ lzsa2_get_nibble: lsr <lzsa_nibflg ; Is there a nibble waiting
} else { } else {
.new_nibble: inc <lzsa_nibflg ; Reset the flag. lzsa2_new_nibble:
LZSA_GET_SRC inc <lzsa_nibflg ; Reset the flag.
+LZSA_GET_SRC
sta <lzsa_nibble ; Preserve for next time. sta <lzsa_nibble ; Preserve for next time.
lsr a ; Extract the hi-nibble. lsr ; Extract the hi-nibble.
lsr a lsr
lsr a lsr
lsr a lsr
!if LZSA_SWAP_XZY { !if LZSA_SWAP_XZY {
sec ; Offset code relies on CS. sec ; Offset code relies on CS.