Optimizations to all the 8088 assembly code. UNTESTED

2024-06-14 14:29:30 +00:00 · 2021-02-17 01:23:01 +01:00 · 2021-02-17 01:23:01 +01:00 · 0ec6a3748e
commit 0ec6a3748e
parent 8075b5ab68
7 changed files with 160 additions and 117 deletions
--- a/asm/8088/LZSA1FTA.ASM
+++ b/asm/8088/LZSA1FTA.ASM
@ -1,4 +1,4 @@
-;  lzsa1fta.asm time-efficient decompressor implementation for 8088
+;  LZSA1FTA.ASM time-efficient decompressor implementation for 8088
 ;  Turbo Assembler IDEAL mode dialect; can also be assembled with NASM.
 ;
 ;  Usual DOS assembler SMALL model assumptions apply.  This code:
@ -8,6 +8,7 @@
 ;  - Trashes all data and segment registers
 ;
 ;  Copyright (C) 2019 Jim Leonard, Emmanuel Marty
+;  Additional optimizations by Krister Nordvall
 ;
 ;  This software is provided 'as-is', without any express or implied
 ;  warranty.  In no event will the authors be held liable for any damages
@ -77,29 +78,29 @@ lzsa1_start:
        jnc     @@got_literals_exact ;if no overflow, we have full count
        je      @@big_literals

-@@mid_literals:
+;@@mid_literals:
        lodsb                   ;grab single extra length byte
-        inc     ah              ;add 256
+;       inc     ah              ;add 256
        xchg    cx,ax           ;with longer counts, we can save some time
-        shr     cx,1            ;by doing a word copy instead of a byte copy.
+        rcr     cl,1            ;by doing a word copy instead of a byte copy. rcr pulls CF (set: the jnc above fell through) into bit 7 = the old +256; assumes ch=0
        rep     movsw           ;We don't need to account for overlap because
-        adc     cx,0            ;source for literals isn't the output buffer.
-        rep     movsb
-        jmp     @@check_offset_size
+;       adc     cx,0            ;source for literals isn't the output buffer.
+        adc     cx,cx           ;cx=0 after rep, so cx=CF (1 if an odd byte remains). rcl cx,1 might be even faster on 8088
+        jmp     @@copy_odd_byte ;the shared rep movsb there copies the odd byte, if any

@@big_literals:
        lodsw                   ;grab 16-bit extra length
        xchg    cx,ax           ;with longer counts, we can save some time
        shr     cx,1            ;by doing a word copy instead of a byte copy.
        rep     movsw
-        adc     cx,0
-        rep     movsb
-        jmp     @@check_offset_size
+        adc     cx,cx           ; rcl cx,1 might be even faster on 8088
+        jmp     @@copy_odd_byte

@@got_literals:
        segcs   xlat            ;shift literals length into place
@@got_literals_exact:
        xchg    cx,ax
+@@copy_odd_byte:
        rep     movsb           ;copy cx literals from ds:si to es:di

@@check_offset_size:
@ -122,8 +123,16 @@ lzsa1_start:

        lodsw                   ;grab 16-bit length
        xchg    cx,ax           ;get ready to do a long copy
-        jcxz    @@done_decompressing ;wait, is it the EOD marker? Exit if so
-        jmp     @@copy_len_preset ;otherwise, do the copy
+;       jcxz    @@done_decompressing ;wait, is it the EOD marker? Exit if so
+;       jmp     @@copy_len_preset ;otherwise, do the copy
+        inc     cx              ;inc+loop nets cx unchanged and branches only if cx<>0
+		loop    @@copy_len_preset ;zero length (EOD marker) falls through to the exit; NOTE(review): loop is a short jump, confirm target is in range
+
+;@@done_decompressing:
+        pop     ax              ;retrieve the original decompression offset
+        xchg    di,ax           ;compute decompressed size
+        sub     ax,di           ;ax = final di - original offset
+        ret                     ;done decompressing, exit to caller

@@get_long_offset:
        lodsw                   ;Get 2-byte match offset
@ -143,12 +152,6 @@ lzsa1_start:
        mov     ds,bp           ;restore ds
        jmp     @@decode_token  ;go decode another token

-@@done_decompressing:
-        pop     ax              ;retrieve the original decompression offset
-        xchg    di,ax           ;compute decompressed size
-        sub     ax,di
-        ret                     ;done decompressing, exit to caller
-
 ;With a confirmed longer match length, we have an opportunity to optimize for
 ;the case where a single byte is repeated long enough that we can benefit
 ;from rep movsw to perform the run (instead of rep movsb).
@ -175,7 +178,7 @@ lzsa1_start:
 ;This won't affect 8088 that much, but it speeds up 8086 and higher.
        shr     cx,1
        rep     movsw
-        adc     cx,0
+        adc     cx,cx           ; rcl cx,1 might be even faster on 8088
        rep     movsb
        mov     si,bp           ;restore si
        pop     ds
@ -189,7 +192,7 @@ lzsa1_start:
        mov     ah,al
        shr     cx,1
        rep     stosw           ;perform word run
-        adc     cx,0
+        adc     cx,cx           ; rcl cx,1 might be even faster on 8088
        rep     stosb           ;finish word run
        mov     si,bp           ;restore si
        pop     ds
@ -199,7 +202,8 @@ lzsa1_start:
        lodsw                   ;load first word of run
        shr     cx,1
        rep     stosw           ;perform word run
-        adc     cx,0            ;despite 2-byte offset, compressor might
+;       adc     cx,0            ;despite 2-byte offset, compressor might
+        adc     cx,cx           ; rcl cx,1 might be even faster on 8088
        rep     stosb           ;output odd length. better safe than sorry.
        mov     si,bp           ;restore si
        pop     ds
--- a/asm/8088/LZSA1JMP.ASM
+++ b/asm/8088/LZSA1JMP.ASM
@ -1,6 +1,5 @@
-; lzsa2fta.asm time-efficient decompressor implementation for 808x CPUs.
+; LZSA1JMP.ASM time-efficient decompressor implementation for 808x CPUs.
 ; Turbo Assembler IDEAL mode dialect.
-; (Is supposed to also assemble with NASM's IDEAL mode support, but YMMV.)
 ;
 ; This code assembles to about 3K of lookup tables and unrolled code,
 ; but the tradeoff for that size is the absolute fastest decompressor
@ -15,7 +14,7 @@
 ; - Trashes all data and segment registers
 ;
 ; Copyright (C) 2019 Jim Leonard, Emmanuel Marty
-; Additional speed optimizations by Pavel Zagrebin
+; Additional speed optimizations by Pavel Zagrebin, Krister Nordvall
 ;
 ; This software is provided 'as-is', without any express or implied
 ; warranty.  In no event will the authors be held liable for any damages
@ -295,14 +294,27 @@ MACRO do_match_copy
        jmp     decode_token
 ENDM

-MACRO do_literal_copy
+MACRO do_literal_copy_shr
 LOCAL even
 ; Copies a literal sequence using words.
 ; Meant for longer lengths; for 128 bytes or less, use REP MOVSB.
 ; requirements: cx=length, ds:si=compdata, es:di=output
 ; must leave cx=0 at exit
        shr     cx,1            ;cx = number of words, CF = 1 if length is odd
-        jnc even
+        jnc     even            ;even length: no single byte to copy
+        movsb                   ;odd length: copy one byte first
+even:
+        rep     movsw           ;copy the words; finishes with cx=0
+ENDM
+
+MACRO do_literal_copy_rcr
+LOCAL even
+; Copies a literal sequence using words.
+; Meant for longer lengths; for 128 bytes or less, use REP MOVSB.
+; requirements: (cf << 8 | cl)=length, ch=0, ds:si=compdata, es:di=output
+; must leave cx=0 at exit
+        rcr     cl,1
+        jnc     even
        movsb
 even:
        rep     movsw
@ -403,21 +415,22 @@ lit_ext_mat_len_1b:
 ;       jz      @@val249_3      ;if zf & cf, 249: get 16-bit literal length
        jc      @@val250_3      ;if cf,      250: get extra literal length byte
        xchg    cx,ax           ;otherwise, we have our literal length
-        do_literal_copy         ;this might be better as rep movsw !!! benchmark
+        do_literal_copy_shr     ;this might be better as rep movsw !!! benchmark
        get_byte_match_offset
        copy_small_match_len
@@val250_3:
-jz      @@val249_3
+        jz      @@val249_3
        lodsb                   ;ah=0; grab single extra length byte
-        inc     ah              ;ax=256+length byte
+;       inc     ah              ;ax=256+length byte
+;       Instead of 'inc ah' (2 bytes), use the CF in do_literal_copy_rcr
        xchg    cx,ax
-        do_literal_copy
+        do_literal_copy_rcr
        get_byte_match_offset
        copy_small_match_len
@@val249_3:
        lodsw                   ;grab 16-bit length
        xchg    cx,ax
-        do_literal_copy
+        do_literal_copy_shr
        get_byte_match_offset
        copy_small_match_len

@ -430,21 +443,22 @@ lit_ext_mat_ext_1b:
 ;       jz      @@val249_4      ;if zf & cf, 249: get 16-bit literal length
        jc      @@val250_4      ;if cf,      250: get extra literal length byte
        xchg    cx,ax           ;otherwise, we have our literal length
-        do_literal_copy         ;this might be better as rep movsw !!! benchmark
+        do_literal_copy_shr     ;this might be better as rep movsw !!! benchmark
        get_byte_match_offset
        copy_large_match_len
@@val250_4:
-jz @@val249_4
+        jz      @@val249_4
        lodsb                   ;ah=0; grab single extra length byte
-        inc     ah              ;ax=256+length byte
+;       inc     ah              ;ax=256+length byte
+;       Instead of 'inc ah' (2 bytes), use the CF in do_literal_copy_rcr
        xchg    cx,ax
-        do_literal_copy
+        do_literal_copy_rcr
        get_byte_match_offset
        copy_large_match_len
@@val249_4:
        lodsw                   ;grab 16-bit length
        xchg    cx,ax
-        do_literal_copy
+        do_literal_copy_shr
        get_byte_match_offset
        copy_large_match_len

@ -494,21 +508,22 @@ lit_ext_mat_len_2b:
 ;       jz      @@val249_7      ;if zf & cf, 249: get 16-bit literal length
        jc      @@val250_7      ;if cf,      250: get extra literal length byte
        xchg    cx,ax           ;otherwise, we have our literal length
-        do_literal_copy         ;this might be better as rep movsw !!! benchmark
+        do_literal_copy_shr     ;this might be better as rep movsw !!! benchmark
        get_word_match_offset
        copy_small_match_len
@@val250_7:
-jz @@val249_7
+        jz      @@val249_7
        lodsb                   ;ah=0; grab single extra length byte
-        inc     ah              ;ax=256+length byte
+;       inc     ah              ;ax=256+length byte
+;       Instead of 'inc ah' (2 bytes), use the CF in do_literal_copy_rcr
        xchg    cx,ax
-        do_literal_copy
+        do_literal_copy_rcr
        get_word_match_offset
        copy_small_match_len
@@val249_7:
        lodsw                   ;grab 16-bit length
        xchg    cx,ax
-        do_literal_copy
+        do_literal_copy_shr
        get_word_match_offset
        copy_small_match_len

@ -521,21 +536,22 @@ lit_ext_mat_ext_2b:
 ;       jz      @@val249_8      ;if zf & cf, 249: get 16-bit literal length
        jc      @@val250_8      ;if cf,      250: get extra literal length byte
        xchg    cx,ax           ;otherwise, we have our literal length
-        do_literal_copy         ;this might be better as rep movsw !!! benchmark
+        do_literal_copy_shr     ;this might be better as rep movsw !!! benchmark
        get_word_match_offset
        copy_large_match_len
@@val250_8:
-jz @@val249_8
+        jz      @@val249_8
        lodsb                   ;ah=0; grab single extra length byte
-        inc     ah              ;ax=256+length byte
+;       inc     ah              ;ax=256+length byte
+;       Instead of 'inc ah' (2 bytes), use the CF in do_literal_copy_rcr
        xchg    cx,ax
-        do_literal_copy
+        do_literal_copy_rcr
        get_word_match_offset
        copy_large_match_len
@@val249_8:
        lodsw                   ;grab 16-bit length
        xchg    cx,ax
-        do_literal_copy
+        do_literal_copy_shr
        get_word_match_offset
        copy_large_match_len

--- a/asm/8088/LZSA2FTA.ASM
+++ b/asm/8088/LZSA2FTA.ASM
@ -1,4 +1,4 @@
-;  lzsa2fta.asm - LZSA v2 time-efficient decompressor implementation for 8088
+;  LZSA2FTA.ASM - LZSA v2 time-efficient decompressor implementation for 8088
 ;  Turbo Assembler IDEAL mode dialect; can also be assembled with NASM.
 ;
 ;  Usual DOS assembler SMALL model assumptions apply.  This code:
@ -8,6 +8,7 @@
 ;  - Trashes all data and segment registers
 ;
 ;  Copyright (C) 2019 Jim Leonard, Emmanuel Marty
+;  Additional optimizations by Krister Nordvall
 ;
 ;  This software is provided 'as-is', without any express or implied
 ;  warranty.  In no event will the authors be held liable for any damages
@ -72,7 +73,7 @@ lzsa2_speed_start:
        mov     bx,0100H        ;bx used by get_nybble

@@decode_token:
-        mov     ax,cx           ;clear ah - cx is zero (and must stay that way)
+        xchg    cx,ax           ;clear ah - cx is zero
        lodsb                   ;read token byte: XYZ|LL|MMMM
        mov     dx,ax           ;keep copy of token in dl

@ -101,28 +102,32 @@ lzsa2_speed_start:
        xchg    cx,ax
        shr     cx,1
        rep     movsw
-        adc     cx,0
+        adc     cx,cx
        rep     movsb
        jmp     @@check_offset

@@got_literals:
        xchg    cx,ax
        rep     movsb           ;copy cx literals from ds:si to es:di
-        jmp     @@check_offset
+;       jmp     @@check_offset
+        db      0B8h            ; Opcode byte for MOV AX, <immed>: the two MOVSBs below become its immediate, so this path skips them but overwrites AX (faster than jumping)

 ;LZSA2 likes to produce tiny literals of 1 or 2 bytes.  Handle them here.
@@lit2b:movsb                   ;two literals: copy one, then fall through for the other
@@lit1b:movsb                   ;one literal

@@check_offset:
-        test    dl,dl           ;check match offset mode in token (X bit)
-        js      @@rep_match_or_large_offset
+;       test    dl,dl           ;check match offset mode in token (X bit)
+;       js      @@rep_match_or_large_offset

-        cmp     dl,040H         ;check if this is a 5 or 9-bit offset (Y bit)
-        jnb     @@offset_9_bit
+;       cmp     dl,040H         ;check if this is a 5 or 9-bit offset (Y bit)
+;       jnb     @@offset_9_bit
+
+        test    dl,0C0h         ;isolate X (bit 7) and Y (bit 6): SF=X, ZF=1 if both clear, PF=1 if X=Y
+        js      @@rep_match_or_large_offset ;X set; the jpe at that label reuses PF from this test
+        jnz     @@offset_9_bit  ;X clear, Y set; fall through when both are clear (5-bit offset)

        ;5 bit offset:
-        xchg    cx,ax           ;clear ah - cx is zero from prior rep movs
        mov     al,020H         ;shift Z (offset bit 4) in place
        and     al,dl
        shl     al,1
@ -131,12 +136,12 @@ lzsa2_speed_start:
        or      al,cl           ;merge nybble
        rol     al,1
        xor     al,0E1H         ;set offset bits 7-5 to 1
-        dec     ah              ;set offset bits 15-8 to 1
+        cbw                     ;set offset bits 15-8 to 1
        jmp     @@get_match_length

@@rep_match_or_16_bit:
        test    dl,020H         ;test bit Z (offset bit 8)
-        jne     @@repeat_match  ;rep-match
+        jnz     @@repeat_match  ;rep-match

        ;16 bit offset:
        lodsw                   ;Get 2-byte match offset
@ -145,18 +150,18 @@ lzsa2_speed_start:

@@offset_9_bit:
        ;9 bit offset:
-        xchg    cx,ax           ;clear ah - cx is zero from prior rep movs
        lodsb                   ;get 8 bit offset from stream in A
-        dec     ah              ;set offset bits 15-8 to 1
+        mov     ah,0FFh         ;set offset bits 15-8 to 1
        test    dl,020H         ;test bit Z (offset bit 8, stored inverted)
-        je      @@get_match_length
+        jz      @@get_match_length ;Z clear: offset bit 8 stays set
        dec     ah              ;Z set: 0FFh -> 0FEh clears offset bit 8
        jmp     @@get_match_length

@@rep_match_or_large_offset:
-        cmp     dl,0c0H         ;check if this is a 13-bit offset
+;       cmp     dl,0c0H         ;check if this is a 13-bit offset
                                ;or a 16-bit offset/rep match (Y bit)
-        jnb     @@rep_match_or_16_bit
+;       jnb     @@rep_match_or_16_bit
+        jpe     @@rep_match_or_16_bit ;PF is still from "test dl,0C0h" at @@check_offset; X is set here, so even parity means Y is set too

        ;13 bit offset:
        mov     ah,020H         ;shift Z (offset bit 12) in place
@ -188,7 +193,7 @@ lzsa2_speed_start:
        lodsb                   ;grab extra length byte
        add     al,018H         ;overflow?
        jnc     @@got_matchlen_big  ;if not, we have entire (big) length
-        je      @@done_decompressing ; detect EOD code
+        jz      @@done_decompressing ; detect EOD code

        lodsw                   ;grab 16-bit length

@ -217,7 +222,7 @@ ENDIF
 ;This won't affect 8088 that much, but it speeds up 8086 and higher.
        shr     cx,1
        rep     movsw
-        adc     cx,0
+        adc     cx,cx
        rep     movsb
        xchg    si,ax
        mov     ds,dx           ;restore ds:si
@ -254,7 +259,7 @@ ENDIF
        mov     ah,al
        shr     cx,1
        rep     stosw           ;perform word run
-        adc     cx,0
+        adc     cx,cx
        rep     stosb           ;finish word run
        pop     si
        mov     ds,dx
@ -266,7 +271,7 @@ IF HANDLE_WORD_RUN
        lodsw                   ;load first word of run
        shr     cx,1
        rep     stosw           ;perform word run
-        adc     cx,0            ;despite 2-byte offset, compressor might
+        adc     cx,cx           ;despite 2-byte offset, compressor might
        rep     stosb           ;output odd length. better safe than sorry.
        pop     si
        mov     ds,dx
--- a/asm/8088/decompress_small_v1.S
+++ b/asm/8088/decompress_small_v1.S
@ -1,6 +1,7 @@
 ;  decompress_small.S - space-efficient decompressor implementation for 8088
 ;
 ;  Copyright (C) 2019 Emmanuel Marty
+;  Additional optimizations by Krister Nordvall
 ;
 ;  This software is provided 'as-is', without any express or implied
 ;  warranty.  In no event will the authors be held liable for any damages
@ -37,10 +38,10 @@ lzsa1_decompress:
   xor cx,cx

 .decode_token:
-   mov ax,cx               ; clear ah - cx is zero from above or from after rep movsb in .copy_match
+   xchg cx,ax              ; clear ah - cx is zero from above or from after rep movsb in .copy_match
   lodsb                   ; read token byte: O|LLL|MMMM
   mov dx,ax               ; keep token in dl
-   
+
   and al,070H             ; isolate literals length in token (LLL)
   mov cl,4
   shr al,cl               ; shift literals length into place
@ -51,7 +52,7 @@ lzsa1_decompress:
   lodsb                   ; grab extra length byte
   add al,07H              ; add LITERALS_RUN_LEN
   jnc .got_literals       ; if no overflow, we have the full literals count, go copy
-   jne .mid_literals
+   jnz .mid_literals

   lodsw                   ; grab 16-bit extra length
   db 81H                  ; mask inc ah/lodsb
@ -88,11 +89,11 @@ lzsa1_decompress:
   lodsb                   ; grab extra length byte
   add al,012H             ; add MIN_MATCH_SIZE + MATCH_RUN_LEN
   jnc .got_matchlen       ; if no overflow, we have the entire length
-   jne .mid_matchlen       
+   jnz .mid_matchlen

   lodsw                   ; grab 16-bit length
   test ax,ax              ; bail if we hit EOD
-   je short .done_decompressing
+   jz short .done_decompressing

   db 81H                  ; mask inc ah/lodsb
                           ; (*like jmp short .got_literals but faster)
@ -103,7 +104,7 @@ lzsa1_decompress:
 .got_matchlen:
   xchg cx,ax              ; copy match length into cx
   push ds                 ; save ds:si (current pointer to compressed data)
-   xchg si,ax          
+   xchg si,ax
   push es
   pop ds
   mov si,di               ; ds:si now points at back reference in output data
--- a/asm/8088/decompress_small_v2.S
+++ b/asm/8088/decompress_small_v2.S
@ -1,6 +1,7 @@
 ;  decompress_small.S - space-efficient decompressor implementation for 8088
 ;
 ;  Copyright (C) 2019 Emmanuel Marty
+;  Additional optimizations by Krister Nordvall
 ;
 ;  This software is provided 'as-is', without any express or implied
 ;  warranty.  In no event will the authors be held liable for any damages
@ -39,10 +40,10 @@ lzsa2_decompress:
   xor bp,bp

 .decode_token:
-   mov ax,cx               ; clear ah - cx is zero from above or from after rep movsb in .copy_match
+   xchg cx,ax              ; clear ah - cx is zero from above or from after rep movsb in .copy_match
   lodsb                   ; read token byte: XYZ|LL|MMMM
   mov dx,ax               ; keep token in dl
-   
+
   and al,018H             ; isolate literals length in token (LL)
   mov cl,3
   shr al,cl               ; shift literals length into place
@ -51,7 +52,7 @@ lzsa2_decompress:
   jne .got_literals       ; no, we have the full literals count from the token, go copy

   call .get_nibble        ; get extra literals length nibble
-   add al,cl               ; add len from token to nibble 
+   add al,cl               ; add len from token to nibble
   cmp al,012H             ; LITERALS_RUN_LEN_V2 + 15 ?
   jne .got_literals       ; if not, we have the full literals count, go copy

@ -82,7 +83,7 @@ lzsa2_decompress:
   lodsb                   ; get 8 bit offset from stream in A
   dec ah                  ; set offset bits 15-8 to 1
   test dl,020H            ; test bit Z (offset bit 8)
-   je .get_match_length
+   jz .get_match_length
 .dec_offset_top:
   dec ah                  ; clear bit 8 if Z bit is set (the jz above skips this when Z is clear)
                           ; or set offset bits 15-8 to 1
@ -102,7 +103,7 @@ lzsa2_decompress:

 .rep_match_or_16_bit:
   test dl,020H            ; test bit Z (offset bit 8)
-   jne .repeat_match       ; rep-match
+   jnz .repeat_match       ; rep-match

                           ; 16 bit offset
   lodsb                   ; Get 2-byte match offset
@ -122,7 +123,7 @@ lzsa2_decompress:
   jne .got_matchlen       ; no, we have the full match length from the token, go copy

   call .get_nibble        ; get extra literals length nibble
-   add al,cl               ; add len from token to nibble 
+   add al,cl               ; add len from token to nibble
   cmp al,018H             ; MIN_MATCH_SIZE_V2 + MATCH_RUN_LEN_V2 + 15?
   jne .got_matchlen       ; no, we have the full match length from the token, go copy

@ -136,7 +137,7 @@ lzsa2_decompress:
 .got_matchlen:
   xchg cx,ax              ; copy match length into cx
   push ds                 ; save ds:si (current pointer to compressed data)
-   xchg si,ax          
+   xchg si,ax
   push es
   pop ds
   lea si,[bp+di]          ; ds:si now points at back reference in output data
@ -163,7 +164,7 @@ lzsa2_decompress:
 .get_nibble:
   neg bh                  ; nibble ready?
   jns .has_nibble
-   
+
   xchg bx,ax
   lodsb                   ; load two nibbles
   xchg bx,ax
--- a/asm/8088/decompress_speed_v1.S
+++ b/asm/8088/decompress_speed_v1.S
@ -8,6 +8,7 @@
 ;  - Trashes all data and segment registers
 ;
 ;  Copyright (C) 2019 Jim Leonard, Emmanuel Marty
+;  Additional optimizations by Krister Nordvall
 ;
 ;  This software is provided 'as-is', without any express or implied
 ;  warranty.  In no event will the authors be held liable for any damages
@ -71,27 +72,27 @@ lzsa1_decompress_speed:

 .mid_literals:
        lodsb                   ;grab single extra length byte
-        inc     ah              ;add 256
+;       inc     ah              ;add 256
        xchg    cx,ax           ;with longer counts, we can save some time
-        shr     cx,1            ;by doing a word copy instead of a byte copy.
+        rcr     cl,1            ;by doing a word copy instead of a byte copy. rcr pulls CF into bit 7 to stand in for the old +256; presumably CF=1 and ch=0 on entry - confirm
        rep     movsw           ;We don't need to account for overlap because
-        adc     cx,0            ;source for literals isn't the output buffer.
-        rep     movsb
-        jmp     .check_offset_size
+;       adc     cx,0            ;source for literals isn't the output buffer.
+        adc     cx,cx           ;cx=0 after rep, so cx=CF (1 if an odd byte remains). rcl cx,1 might be even faster on 8088
+        jmp     .copy_odd_byte  ;the shared rep movsb there copies the odd byte, if any

 .big_literals:
        lodsw                   ;grab 16-bit extra length
        xchg    cx,ax           ;with longer counts, we can save some time
        shr     cx,1            ;by doing a word copy instead of a byte copy.
        rep     movsw
-        adc     cx,0
-        rep     movsb
-        jmp     .check_offset_size
+        adc     cx,cx           ; rcl cx,1 might be even faster on 8088
+        jmp     .copy_odd_byte

 .got_literals:
-        cs   xlat               ;shift literals length into place
+        cs      xlat            ;shift literals length into place
 .got_literals_exact:
        xchg    cx,ax
+.copy_odd_byte:
        rep     movsb           ;copy cx literals from ds:si to es:di

 .check_offset_size:
@ -114,8 +115,16 @@ lzsa1_decompress_speed:

        lodsw                   ;grab 16-bit length
        xchg    cx,ax           ;get ready to do a long copy
-        jcxz    .done_decompressing ;wait, is it the EOD marker? Exit if so
-        jmp     .copy_len_preset ;otherwise, do the copy
+;       jcxz    .done_decompressing ;wait, is it the EOD marker? Exit if so
+;       jmp     .copy_len_preset ;otherwise, do the copy
+        inc     cx              ;inc+loop nets cx unchanged and branches only if cx<>0
+		loop    .copy_len_preset ;zero length (EOD marker) falls through to the exit; NOTE(review): loop is a short jump, confirm target is in range
+
+;.done_decompressing:
+        pop     ax              ;retrieve the original decompression offset
+        xchg    di,ax           ;compute decompressed size
+        sub     ax,di           ;ax = final di - original offset
+        ret                     ;done decompressing, exit to caller

 .get_long_offset:
        lodsw                   ;Get 2-byte match offset
@ -135,12 +144,6 @@ lzsa1_decompress_speed:
        mov     ds,bp           ;restore ds
        jmp     .decode_token  ;go decode another token

-.done_decompressing:
-        pop     ax              ;retrieve the original decompression offset
-        xchg    di,ax           ;compute decompressed size
-        sub     ax,di
-        ret                     ;done decompressing, exit to caller
-
 ;With a confirmed longer match length, we have an opportunity to optimize for
 ;the case where a single byte is repeated long enough that we can benefit
 ;from rep movsw to perform the run (instead of rep movsb).
@ -167,7 +170,7 @@ lzsa1_decompress_speed:
 ;This won't affect 8088 that much, but it speeds up 8086 and higher.
        shr     cx,1
        rep     movsw
-        adc     cx,0
+        adc     cx,cx           ; rcl cx,1 might be even faster on 8088
        rep     movsb
        mov     si,bp           ;restore si
        pop     ds
@ -181,7 +184,7 @@ lzsa1_decompress_speed:
        mov     ah,al
        shr     cx,1
        rep     stosw           ;perform word run
-        adc     cx,0
+        adc     cx,cx           ; rcl cx,1 might be even faster on 8088
        rep     stosb           ;finish word run
        mov     si,bp           ;restore si
        pop     ds
@ -191,7 +194,8 @@ lzsa1_decompress_speed:
        lodsw                   ;load first word of run
        shr     cx,1
        rep     stosw           ;perform word run
-        adc     cx,0            ;despite 2-byte offset, compressor might
+;       adc     cx,0            ;despite 2-byte offset, compressor might
+        adc     cx,cx           ; rcl cx,1 might be even faster on 8088
        rep     stosb           ;output odd length. better safe than sorry.
        mov     si,bp           ;restore si
        pop     ds
--- a/asm/8088/decompress_speed_v2.S
+++ b/asm/8088/decompress_speed_v2.S
@ -8,6 +8,7 @@
 ;  - Trashes all data and segment registers
 ;
 ;  Copyright (C) 2019 Jim Leonard, Emmanuel Marty
+;  Additional optimizations by Krister Nordvall
 ;
 ;  This software is provided 'as-is', without any express or implied
 ;  warranty.  In no event will the authors be held liable for any damages
@ -64,17 +65,17 @@ lzsa2_decompress_speed:
        mov     bx,0100H        ;bx used by get_nybble

 .decode_token:
-        mov     ax,cx           ;clear ah - cx is zero (and must stay that way)
+        xchg    cx,ax           ;clear ah - cx is zero
        lodsb                   ;read token byte: XYZ|LL|MMMM
        mov     dx,ax           ;keep copy of token in dl

        and     al,018H         ;isolate literals length in token (LL)
-        jz      .check_offset  ;no literals? stop decoding, go to matches
+        jz      .check_offset   ;no literals? stop decoding, go to matches

 ;At this point, al can be in three (unshifted) states: 1, 2, or 3.
 ;3 = not done yet.
-        cmp     al,(2 << 3)    ;LITERALS_RUN_LEN_V2? (original: cmp al,03h)
-        jb      .lit1b         ;LZSA2 output 1-byte more often, so test first
+        cmp     al,(2 << 3)     ;LITERALS_RUN_LEN_V2? (original: cmp al,03h)
+        jb      .lit1b          ;LZSA2 output 1-byte more often, so test first
        je      .lit2b

        mov     cl,3
@ -82,7 +83,7 @@ lzsa2_decompress_speed:
        get_nybble              ;cl := get extra literals length nybble
        add     al,cl           ;add len from token to nybble
        cmp     al,012H         ;LITERALS_RUN_LEN_V2 + 15 ?
-        jne     .got_literals  ;if not, we have the full literals count
+        jne     .got_literals   ;if not, we have the full literals count
        lodsb                   ;grab extra length byte
        add     al,012H         ;overflow?
        jnc     .got_literals_big ;if not, we have a big full literals count
@ -93,28 +94,36 @@ lzsa2_decompress_speed:
        xchg    cx,ax
        shr     cx,1
        rep     movsw
-        adc     cx,0
+        adc     cx,cx
        rep     movsb
        jmp     .check_offset

 .got_literals:
        xchg    cx,ax
        rep     movsb           ;copy cx literals from ds:si to es:di
-        jmp     .check_offset
+;       jmp     .check_offset
+        db      0B8h            ; Opcode byte for MOV AX, <immed>: the two MOVSBs below become its immediate, so this path skips them but overwrites AX (faster than jumping)

 ;LZSA2 likes to produce tiny literals of 1 or 2 bytes.  Handle them here.
 .lit2b:movsb                   ;two literals: copy one, then fall through for the other
 .lit1b:movsb                   ;one literal

 .check_offset:
+%if 0   ; 9 bytes
        test    dl,dl           ;check match offset mode in token (X bit)
        js      .rep_match_or_large_offset

        cmp     dl,040H         ;check if this is a 5 or 9-bit offset (Y bit)
        jnb     .offset_9_bit
+%else   ; 7 bytes
+        ; This is shorter than the above and should be faster overall if
+        ; falling through to 5-bit offsets is the most common path.
+        test    dl,0C0h         ;isolate X (bit 7) and Y (bit 6): SF=X, ZF=1 if both clear, PF=1 if X=Y
+        js      .rep_match_or_large_offset ;X set; the jpe at that label reuses PF from this test
+		jnz     .offset_9_bit   ;X clear, Y set; fall through when both are clear (5-bit offset)
+%endif ; 0

        ;5 bit offset:
-        xchg    cx,ax           ;clear ah - cx is zero from prior rep movs
        mov     al,020H         ;shift Z (offset bit 4) in place
        and     al,dl
        shl     al,1
@ -123,12 +132,12 @@ lzsa2_decompress_speed:
        or      al,cl           ;merge nybble
        rol     al,1
        xor     al,0E1H         ;set offset bits 7-5 to 1
-        dec     ah              ;set offset bits 15-8 to 1
+        cbw                     ;set offset bits 15-8 to 1
        jmp     .get_match_length

 .rep_match_or_16_bit:
        test    dl,020H         ;test bit Z (offset bit 8)
-        jne     .repeat_match  ;rep-match
+        jnz     .repeat_match   ;rep-match

        ;16 bit offset:
        lodsw                   ;Get 2-byte match offset
@ -137,18 +146,21 @@ lzsa2_decompress_speed:

 .offset_9_bit:
        ;9 bit offset:
-        xchg    cx,ax           ;clear ah - cx is zero from prior rep movs
        lodsb                   ;get 8 bit offset from stream in A
-        dec     ah              ;set offset bits 15-8 to 1
+        mov     ah,0FFh         ;set offset bits 15-8 to 1
        test    dl,020H         ;test bit Z (offset bit 8, stored inverted)
-        je      .get_match_length
+        jz      .get_match_length ;Z clear: offset bit 8 stays set
        dec     ah              ;Z set: 0FFh -> 0FEh clears offset bit 8
        jmp     .get_match_length

 .rep_match_or_large_offset:
+%if 0   ; NOTE(review): must be switched together with the %if 0 at .check_offset - the jpe below only works after "test dl,0C0h"
        cmp     dl,0c0H         ;check if this is a 13-bit offset
                                ;or a 16-bit offset/rep match (Y bit)
        jnb     .rep_match_or_16_bit
+%else
+        jpe     .rep_match_or_16_bit ;PF is still from "test dl,0C0h" at .check_offset; X is set here, so even parity means Y is set too
+%endif ; 0

        ;13 bit offset:
        mov     ah,020H         ;shift Z (offset bit 12) in place
@ -180,7 +192,7 @@ lzsa2_decompress_speed:
        lodsb                   ;grab extra length byte
        add     al,018H         ;overflow?
        jnc     .got_matchlen_big  ;if not, we have entire (big) length
-        je      .done_decompressing ; detect EOD code
+        jz      .done_decompressing ; detect EOD code

        lodsw                   ;grab 16-bit length

@ -209,7 +221,7 @@ lzsa2_decompress_speed:
 ;This won't affect 8088 that much, but it speeds up 8086 and higher.
        shr     cx,1
        rep     movsw
-        adc     cx,0
+        adc     cx,cx
        rep     movsb
        xchg    si,ax
        mov     ds,dx           ;restore ds:si
@ -246,7 +258,7 @@ lzsa2_decompress_speed:
        mov     ah,al
        shr     cx,1
        rep     stosw           ;perform word run
-        adc     cx,0
+        adc     cx,cx
        rep     stosb           ;finish word run
        pop     si
        mov     ds,dx
@ -258,7 +270,7 @@ lzsa2_decompress_speed:
        lodsw                   ;load first word of run
        shr     cx,1
        rep     stosw           ;perform word run
-        adc     cx,0            ;despite 2-byte offset, compressor might
+        adc     cx,cx           ;despite 2-byte offset, compressor might
        rep     stosb           ;output odd length. better safe than sorry.
        pop     si
        mov     ds,dx