Fast 6502 LZSA2 depacker: smaller size, same speed

This commit is contained in:
Emmanuel Marty 2020-01-03 10:01:23 +01:00 committed by GitHub
parent 8721c11041
commit 410544f4e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -5,10 +5,12 @@
;
; NMOS 6502 decompressor for data stored in Emmanuel Marty's LZSA2 format.
;
; This code is written for the ACME assembler.
;
; Optional code is presented for two minor 6502 optimizations that break
; compatibility with the current LZSA2 format standard.
;
; This code is written for the ACME assembler.
; The code is 241 bytes for the small version, and 256 bytes for the normal.
;
; Copyright John Brandwood 2019.
;
@ -46,12 +48,34 @@ LZSA_SWAP_LEN16 = 0
LZSA_SWAP_XZY = 0
;
; Remove code inlining to save space?
;
; This saves 15 bytes of code, but decompression is 7% slower.
; Choose size over speed (within sane limits)?
;
LZSA_BEST_SIZE = 0
LZSA_SMALL_SIZE = 0
;
; Remove code inlining to save space?
;
; This saves 15 bytes of code at the cost of 7% speed.
;
!if LZSA_SMALL_SIZE {
LZSA_NO_INLINE = 1
} else {
LZSA_NO_INLINE = 0
}
;
; Use smaller code for copying literals?
;
; This saves 11 bytes of code at the cost of 5% speed.
;
!if LZSA_SMALL_SIZE {
LZSA_SHORT_CP = 1
} else {
LZSA_SHORT_CP = 0
}
;
; Assume that we're decompressing from a large multi-bank
@ -66,62 +90,61 @@ LZSA_FROM_BANK = 0
;
!if LZSA_FROM_BANK {
!macro LZSA_INC_PAGE {
jsr .next_page
}
!macro LZSA_INC_PAGE {
jsr .next_page
}
} else {
!macro LZSA_INC_PAGE {
inc <lzsa_srcptr + 1
}
!macro LZSA_INC_PAGE {
inc <lzsa_srcptr + 1
}
}
;
; Macro to read a byte from the compressed source data.
;
!if LZSA_BEST_SIZE {
!if LZSA_NO_INLINE {
!macro LZSA_GET_SRC {
jsr .get_byte
}
!macro LZSA_GET_SRC {
jsr lzsa2_get_byte
}
} else {
!macro LZSA_GET_SRC {
lda (lzsa_srcptr),y
inc <lzsa_srcptr + 0
bne .skip
+LZSA_INC_PAGE
!macro LZSA_GET_SRC {
lda (lzsa_srcptr),y
inc <lzsa_srcptr + 0
bne .skip
+LZSA_INC_PAGE
.skip:
}
}
}
;
; Macro to speed up reading 50% of nibbles.
;
; This seems to save very few cycles compared to the
; increase in code size, and it isn't recommended.
;
LZSA_SLOW_NIBL = 1
!if (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) {
!if (LZSA_SLOW_NIBL + LZSA_SMALL_SIZE) {
!macro LZSA_GET_NIBL {
jsr lzsa2_get_nibble ; Always call a function.
}
!macro LZSA_GET_NIBL {
jsr lzsa2_get_nibble ; Always call a function.
}
} else {
!macro LZSA_GET_NIBL {
lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble.
bcs .skip\@
jsr .new_nibble ; Extract the hi-nibble.
.skip\@: ora #$F0
}
!macro LZSA_GET_NIBL {
lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble.
bcs .skip
jsr lzsa2_new_nibble ; Extract the hi-nibble.
.skip: ora #$F0
}
}
@ -141,15 +164,19 @@ lzsa_winptr = $FA ; 1 word.
lzsa_srcptr = $FC ; 1 word.
lzsa_dstptr = $FE ; 1 word.
lzsa_length = lzsa_winptr ; 1 word.
LZSA_SRC_LO = $FC
LZSA_SRC_HI = $FD
LZSA_DST_LO = $FE
LZSA_DST_HI = $FF
; ***************************************************************************
; ***************************************************************************
;
; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2b format.
; lzsa2_unpack - Decompress data stored in Emmanuel Marty's LZSA2 format.
;
; Args: lzsa_srcptr = ptr to compressed data
; Args: lzsa_dstptr = ptr to output buffer
@ -182,20 +209,45 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
jsr .get_length ; X=0 table index for literals.
!if LZSA_SHORT_CP {
.got_cp_len: cmp #0 ; Check the lo-byte of length.
beq .put_cp_len
inx ; Increment # of pages to copy.
.put_cp_len: stx <lzsa_length
tax
.cp_page: lda (lzsa_srcptr),y
sta (lzsa_dstptr),y
inc <lzsa_srcptr + 0
bne .skip1
inc <lzsa_srcptr + 1
.skip1: inc <lzsa_dstptr + 0
bne .skip2
inc <lzsa_dstptr + 1
.skip2: dex
bne .cp_page
dec <lzsa_length ; Any full pages left to copy?
bne .cp_page
} else {
.got_cp_len: tay ; Check the lo-byte of length.
beq .cp_page
inx ; Increment # of pages to copy.
.get_cp_src: clc ; Calc source for partial
adc <lzsa_srcptr + 0 ; page.
.get_cp_src: clc ; Calc address of partial page.
adc <lzsa_srcptr + 0
sta <lzsa_srcptr + 0
bcs .get_cp_dst
dec <lzsa_srcptr + 1
.get_cp_dst: tya
clc ; Calc destination for partial
adc <lzsa_dstptr + 0 ; page.
clc ; Calc address of partial page.
adc <lzsa_dstptr + 0
sta <lzsa_dstptr + 0
bcs .get_cp_idx
dec <lzsa_dstptr + 1
@ -214,7 +266,9 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
dex ; Any full pages left to copy?
bne .cp_page
!if LZSA_SWAP_XZY {
}
!if LZSA_SWAP_XZY {
;
; Shorter and faster path with NEW order of bits.
@ -240,10 +294,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
bne .get_9_bits
.get_5_bits: dex ; X=$FF
.get_13_bits: LZSA_GET_NIBL ; Always returns with CS.
.get_13_bits: +LZSA_GET_NIBL ; Always returns with CS.
bvc .get_5_skip
clc
.get_5_skip: rol a ; Shift into position, set C.
.get_5_skip: rol ; Shift into position, set C.
cpx #$00 ; X=$FF for a 5-bit offset.
bne .set_offset
sbc #2 ; Subtract 512 because 13-bit
@ -264,7 +318,7 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
;
; Slower and longer path with STD order of bits.
;
; Z80 NES
; STD NEW
; ================================
; xyz xzy
; 00z 0z0 5-bit offset
@ -274,6 +328,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; 111 111 repeat offset
; NVZ for a BIT instruction
;
; N.B. Costs 3 bytes in code length.
; get5 and get13 are 8 cycles slower.
; get9, get16, and rep are 4 cycles slower.
;
.lz_offset: lda <lzsa_cmdbuf
asl
@ -313,7 +371,7 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; N.B. X=0 is expected and guaranteed when we get here.
;
.get_16_bits: jsr .get_byte ; Get hi-byte of offset.
.get_16_bits: jsr lzsa2_get_byte ; Get hi-byte of offset.
tax
.get_low8: +LZSA_GET_SRC ; Get lo-byte of offset.
@ -379,8 +437,6 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
;
; Get 16-bit length in X:A register pair.
;
; N.B. Requires reversal of bytes in 16-bit length.
;
.get_length: +LZSA_GET_NIBL
cmp #$FF ; Extended length?
@ -390,34 +446,35 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
.got_length: ldx #$00 ; Set hi-byte of 4 & 8 bit
rts ; lengths.
.byte_length: jsr .get_byte ; So rare, this can be slow!
.byte_length: jsr lzsa2_get_byte ; So rare, this can be slow!
adc .byte_len_tbl,x ; Always CS from previous CMP.
bcc .got_length
beq .finished
!if LZSA_SWAP_LEN16 {
.word_length: jsr .get_byte ; So rare, this can be slow!
.word_length: jsr lzsa2_get_byte ; So rare, this can be slow!
tax
} else {
.word_length: jsr .get_byte ; So rare, this can be slow!
.word_length: jsr lzsa2_get_byte ; So rare, this can be slow!
pha
jsr .get_byte ; So rare, this can be slow!
jsr lzsa2_get_byte ; So rare, this can be slow!
tax
pla
rts
}
.get_byte: lda (lzsa_srcptr),y ; Subroutine version for when
lzsa2_get_byte:
lda (lzsa_srcptr),y ; Subroutine version for when
inc <lzsa_srcptr + 0 ; inlining isn't advantageous.
beq .next_page
rts
.next_page: inc <lzsa_srcptr + 1 ; Inc & test for bank overflow.
!if LZSA_FROM_BANK {
!if LZSA_FROM_BANK {
bmi .next_bank ; Change for target hardware!
}
rts
@ -430,9 +487,10 @@ lzsa2_unpack: ldy #0 ; Initialize source index.
; Get a nibble value from compressed data in A.
;
!if (LZSA_SLOW_NIBL + LZSA_BEST_SIZE) {
!if (LZSA_SLOW_NIBL | LZSA_SMALL_SIZE) {
lzsa2_get_nibble: lsr <lzsa_nibflg ; Is there a nibble waiting?
lzsa2_get_nibble:
lsr <lzsa_nibflg ; Is there a nibble waiting?
lda <lzsa_nibble ; Extract the lo-nibble.
bcs .got_nibble
@ -453,13 +511,14 @@ lzsa2_get_nibble: lsr <lzsa_nibflg ; Is there a nibble waiting
} else {
.new_nibble: inc <lzsa_nibflg ; Reset the flag.
LZSA_GET_SRC
lzsa2_new_nibble:
inc <lzsa_nibflg ; Reset the flag.
+LZSA_GET_SRC
sta <lzsa_nibble ; Preserve for next time.
lsr a ; Extract the hi-nibble.
lsr a
lsr a
lsr a
lsr ; Extract the hi-nibble.
lsr
lsr
lsr
!if LZSA_SWAP_XZY {
sec ; Offset code relies on CS.