diff --git a/asm/8088/LZSA1JMP.ASM b/asm/8088/LZSA1JMP.ASM index b96b498..a1eac9f 100644 --- a/asm/8088/LZSA1JMP.ASM +++ b/asm/8088/LZSA1JMP.ASM @@ -1,4 +1,4 @@ -; lzsa1fta.asm time-efficient decompressor implementation for 808x CPUs. +; lzsa1jmp.asm time-efficient decompressor implementation for 808x CPUs. ; Turbo Assembler IDEAL mode dialect. ; (Is supposed to also assemble with NASM's IDEAL mode support, but YMMV.) ; @@ -15,6 +15,7 @@ ; - Trashes all data and segment registers ; ; Copyright (C) 2019 Jim Leonard, Emmanuel Marty +; Additional speed optimizations by Pavel Zagrebin ; ; This software is provided 'as-is', without any express or implied ; warranty. In no event will the authors be held liable for any damages @@ -107,7 +108,8 @@ ; the 'M' bits in the token form the value 15, and an extra byte follows here, ; with three possible types of value. ; -; 0-237: the value is added to the 15 stored in the token. The final value is 3 + 15 + this byte. +; 0-237: the value is added to the 15 stored in the token. +; The final value is 3 + 15 + this byte. ; 239: a second byte follows. The final match length is 256 + the second byte. ; 238: a second and third byte follow, forming a little-endian 16-bit value. ; The final encoded match length is that 16-bit value. @@ -121,6 +123,14 @@ ; have the most code, but these are uncommon paths so the ; tiny speed loss in just these paths is not a concern. +;Setting OPTIMIZE_LONG_RLE to 1 speeds up decompressing long runs of the +;same 16-bit word value, but hurts decompression speed of other data +;types slightly. Turn this on if you know your data has very long 16-bit +;word-based runs (reported as RLE2 sequences in the LZSA compressor output +;with an average length of at least 32 bytes), otherwise leave it off.
+ +OPTIMIZE_LONG_RLE EQU 0 + SEGMENT CODE para public ASSUME cs:CODE, ds:CODE @@ -138,43 +148,35 @@ leml2 EQU OFFSET lit_ext_mat_len_2b leme2 EQU OFFSET lit_ext_mat_ext_2b ;short-circuit special cases for 0 through 6 literal copies: -l6ml1 EQU OFFSET lit_len_mat_len_1b +l6ml1 EQU OFFSET lit_len_mat_len_1b_6 l6me1 EQU OFFSET lit_len_mat_ext_1b -l6ml2 EQU OFFSET lit_len_mat_len_2b +l6ml2 EQU OFFSET lit_len_mat_len_2b_6 l6me2 EQU OFFSET lit_len_mat_ext_2b -l5ml1 EQU OFFSET lit_len_mat_len_1b + 1 +l5ml1 EQU OFFSET lit_len_mat_len_1b_45 l5me1 EQU OFFSET lit_len_mat_ext_1b + 1 -l5ml2 EQU OFFSET lit_len_mat_len_2b + 1 +l5ml2 EQU OFFSET lit_len_mat_len_2b_45 l5me2 EQU OFFSET lit_len_mat_ext_2b + 1 -l4ml1 EQU OFFSET lit_len_mat_len_1b + 2 +l4ml1 EQU OFFSET lit_len_mat_len_1b_45 + 1 l4me1 EQU OFFSET lit_len_mat_ext_1b + 2 -l4ml2 EQU OFFSET lit_len_mat_len_2b + 2 +l4ml2 EQU OFFSET lit_len_mat_len_2b_45 + 1 l4me2 EQU OFFSET lit_len_mat_ext_2b + 2 -l3ml1 EQU OFFSET lit_len_mat_len_1b + 3 +l3ml1 EQU OFFSET lit_len_mat_len_1b_23 l3me1 EQU OFFSET lit_len_mat_ext_1b + 3 -l3ml2 EQU OFFSET lit_len_mat_len_2b + 3 +l3ml2 EQU OFFSET lit_len_mat_len_2b_23 l3me2 EQU OFFSET lit_len_mat_ext_2b + 3 -l2ml1 EQU OFFSET lit_len_mat_len_1b + 4 +l2ml1 EQU OFFSET lit_len_mat_len_1b_23 + 1 l2me1 EQU OFFSET lit_len_mat_ext_1b + 4 -l2ml2 EQU OFFSET lit_len_mat_len_2b + 4 +l2ml2 EQU OFFSET lit_len_mat_len_2b_23 + 1 l2me2 EQU OFFSET lit_len_mat_ext_2b + 4 -l1ml1 EQU OFFSET lit_len_mat_len_1b + 5 +l1ml1 EQU OFFSET lit_len_mat_len_1b_01 l1me1 EQU OFFSET lit_len_mat_ext_1b + 5 -l1ml2 EQU OFFSET lit_len_mat_len_2b + 5 +l1ml2 EQU OFFSET lit_len_mat_len_2b_01 l1me2 EQU OFFSET lit_len_mat_ext_2b + 5 -l0ml1 EQU OFFSET lit_len_mat_len_1b + 6 ; MMMM handling comes after LLL code -l0me1 EQU OFFSET lit_len_mat_ext_1b + 6 ; MMMM handling comes after LLL code -l0ml2 EQU OFFSET lit_len_mat_len_2b + 6 ; MMMM handling comes after LLL code -l0me2 EQU OFFSET lit_len_mat_ext_2b + 6 ; MMMM handling comes after LLL code 
+l0ml1 EQU OFFSET lit_len_mat_len_1b_01 + 1 ; MMMM handling comes after LLL code +l0me1 EQU OFFSET lit_len_mat_ext_1b + 6 ; MMMM handling comes after LLL code +l0ml2 EQU OFFSET lit_len_mat_len_2b_01 + 1 ; MMMM handling comes after LLL code +l0me2 EQU OFFSET lit_len_mat_ext_2b + 6 ; MMMM handling comes after LLL code -; === Hand-written (!) jumptable actually begins here. -; Located before the program code results in an extra JMP and 3 wasted bytes, -; but it makes the code easier to follow in this location. -; Relocate the jump table after the ENDP directive to save 3 bytes. -; -; 7 6 5 4 3 2 1 0 -; O L L L M M M M -; ; 0 1 2 3 4 5 6 7 8 9 a b c d e f jtbl DW l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0ml1,l0me1 ;0 DW l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1ml1,l1me1 ;1 @@ -215,7 +217,7 @@ MACRO get_word_match_offset ENDM MACRO do_match_copy_long -LOCAL do_run, do_run_w +LOCAL even0,even1,even2,do_run,do_run_w ; Copies a long match as optimally as possible. ; requirements: cx=length, bp=negative offset, ds:si=compdata, es:di=output ; trashes: ax, bx @@ -226,45 +228,52 @@ LOCAL do_run, do_run_w xchg ax,si ;save si lea si,[bp+di] ;si = output buffer + negative match offset cmp bp,-2 ;do we have a byte/word run to optimize? - jae do_run ;perform a run if so, otherwise fall through -;You may be tempted to change "jae" to "jge" because DX is a signed number. -;Don't! The total window is 64k, so if you treat this as a signed comparison, -;you will get incorrect results for offsets over 32K. +IF OPTIMIZE_LONG_RLE + jae do_run ;catch offset = -2 or -1 +ELSE + ja do_run ;catch offset = -1 +ENDIF ;If we're here, we have a long copy and it isn't byte-overlapping (if it ;overlapped, we'd be in @@do_run) So, let's copy faster with REP MOVSW. ;This affects 8088 only slightly, but is a bigger win on 8086 and higher. 
shr cx,1 + jnc even0 + movsb +even0: rep movsw - adc cl,0 - rep movsb xchg si,ax ;restore si mov ds,bx ;restore ds jmp decode_token - do_run: +IF OPTIMIZE_LONG_RLE je do_run_w ;if applicable, handle word-sized value faster +ENDIF xchg dx,ax ;save si into dx, as ax is getting trashed lodsb ;load first byte of run into al mov ah,al shr cx,1 + jnc even1 + stosb +even1: rep stosw ;perform word run - adc cl,0 - rep stosb ;finish word run mov si,dx ;restore si mov ds,bx ;restore ds jmp decode_token +IF OPTIMIZE_LONG_RLE do_run_w: xchg dx,ax ;save si into dx, as ax is getting trashed lodsw ;load first word of run shr cx,1 rep stosw ;perform word run - adc cl,0 ;despite 2-byte offset, compressor might - rep stosb ;output odd length. better safe than sorry. + jnc even2 + stosb ;odd length: trailing byte must follow rep stosw so the word pattern stays in phase +even2: mov si,dx ;restore si mov ds,bx ;restore ds jmp decode_token +ENDIF ENDM MACRO do_match_copy @@ -277,6 +286,9 @@ MACRO do_match_copy mov ds,ax ;ds=es xchg ax,si ;save si lea si,[bp+di] ;si = output buffer + negative match offset + movsb + movsb + movsb ;Handle MINMATCH (instead of add cx,MINMATCH) rep movsb xchg si,ax ;restore si mov ds,bx ;restore ds @@ -284,34 +296,36 @@ MACRO do_match_copy ENDM MACRO do_literal_copy +LOCAL even ; Copies a literal sequence using words. ; Meant for longer lengths; for 128 bytes or less, use REP MOVSB.
; requirements: cx=length, ds:si=compdata, es:di=output ; must leave cx=0 at exit shr cx,1 + jnc even + movsb +even: rep movsw - adc cl,0 - rep movsb ENDM MACRO copy_small_match_len and al,0FH ;isolate length in token (MMMM) - add al,minmatch ;ax=match length xchg cx,ax ;cx=match length do_match_copy ;copy match with cx=length, bp=offset ENDM MACRO copy_large_match_len -LOCAL val239, val238, EOD +LOCAL val239,val238,EOD ; Handle MMMM=Fh ; Assumptions: ah=0 from get_????_match_offset's xchg lodsb ;grab extra match length byte add al,0Fh+minmatch ;add MATCH_RUN_LEN + MIN_MATCH_SIZE - jz val238 ;if zf & cf, 238: get 16-bit match length +; jz val238 ;if zf & cf, 238: get 16-bit match length jc val239 ;if cf, 239: get extra match length byte xchg cx,ax ;otherwise, we have our match length do_match_copy_long ;copy match with cx=length, bp=offset val239: + jz val238 lodsb ;ah=0; grab single extra length byte inc ah ;ax=256+length byte xchg cx,ax @@ -347,16 +361,27 @@ decode_token: ; Path #1: LLL=0-6, MMMM=0-Eh, O=0 (1-byte match offset) ; Handle LLL=0-6 by jumping directly into # of bytes to copy (6 down to 1) -lit_len_mat_len_1b: - movsb - movsb - movsb - movsb - movsb +lit_len_mat_len_1b_01: movsb get_byte_match_offset copy_small_match_len - +lit_len_mat_len_1b_23: + movsb + movsw + get_byte_match_offset + copy_small_match_len +lit_len_mat_len_1b_45: + movsb + movsw + movsw + get_byte_match_offset + copy_small_match_len +lit_len_mat_len_1b_6: + movsw + movsw + movsw + get_byte_match_offset + copy_small_match_len ; Path #2: LLL=0-6, MMMM=Fh, O=0 (1-byte match offset) lit_len_mat_ext_1b: @@ -375,13 +400,14 @@ lit_ext_mat_len_1b: ; on entry: ax=0 + token, bp=ax lodsb ;grab extra literal length byte add al,litrunlen ;add 7h literal run length - jz @@val249_3 ;if zf & cf, 249: get 16-bit literal length +; jz @@val249_3 ;if zf & cf, 249: get 16-bit literal length jc @@val250_3 ;if cf, 250: get extra literal length byte xchg cx,ax ;otherwise, we have our literal length 
do_literal_copy ;this might be better as rep movsw !!! benchmark get_byte_match_offset copy_small_match_len @@val250_3: +jz @@val249_3 lodsb ;ah=0; grab single extra length byte inc ah ;ax=256+length byte xchg cx,ax @@ -401,13 +427,14 @@ lit_ext_mat_ext_1b: ; on entry: ax=0 + token, bp=ax lodsb ;grab extra literal length byte add al,litrunlen ;add 7h literal run length - jz @@val249_4 ;if zf & cf, 249: get 16-bit literal length +; jz @@val249_4 ;if zf & cf, 249: get 16-bit literal length jc @@val250_4 ;if cf, 250: get extra literal length byte xchg cx,ax ;otherwise, we have our literal length do_literal_copy ;this might be better as rep movsw !!! benchmark get_byte_match_offset copy_large_match_len @@val250_4: +jz @@val249_4 lodsb ;ah=0; grab single extra length byte inc ah ;ax=256+length byte xchg cx,ax @@ -424,17 +451,30 @@ lit_ext_mat_ext_1b: ; Path #5: LLL=0-6, MMMM=0-Eh, O=1 (2-byte match offset) ; Handle LLL=0-6 by jumping directly into # of bytes to copy (6 down to 1) -lit_len_mat_len_2b: - movsb - movsb - movsb +lit_len_mat_len_2b_01: movsb + get_word_match_offset + copy_small_match_len +lit_len_mat_len_2b_23: movsb + movsw + get_word_match_offset + copy_small_match_len +lit_len_mat_len_2b_45: movsb + movsw + movsw + get_word_match_offset + copy_small_match_len +lit_len_mat_len_2b_6: + movsw + movsw + movsw get_word_match_offset copy_small_match_len +; Path #6: LLL=0-6, MMMM=Fh, O=1 (2-byte match offset) ; Path #6: LLL=0-6, MMMM=Fh, O=1 (2-byte match offset) lit_len_mat_ext_2b: movsb @@ -446,19 +486,19 @@ lit_len_mat_ext_2b: get_word_match_offset copy_large_match_len - ; Path #7: LLL=7, MMMM=0-Eh, O=1 (2-byte match offset) lit_ext_mat_len_2b: ; on entry: ax=0 + token, bp=ax lodsb ;grab extra literal length byte add al,litrunlen ;add 7h literal run length - jz @@val249_7 ;if zf & cf, 249: get 16-bit literal length +; jz @@val249_7 ;if zf & cf, 249: get 16-bit literal length jc @@val250_7 ;if cf, 250: get extra literal length byte xchg cx,ax ;otherwise, we 
have our literal length do_literal_copy ;this might be better as rep movsw !!! benchmark get_word_match_offset copy_small_match_len @@val250_7: +jz @@val249_7 lodsb ;ah=0; grab single extra length byte inc ah ;ax=256+length byte xchg cx,ax @@ -478,13 +518,14 @@ lit_ext_mat_ext_2b: ; on entry: ax=0 + token, bp=ax lodsb ;grab extra literal length byte add al,litrunlen ;add 7h literal run length - jz @@val249_8 ;if zf & cf, 249: get 16-bit literal length +; jz @@val249_8 ;if zf & cf, 249: get 16-bit literal length jc @@val250_8 ;if cf, 250: get extra literal length byte xchg cx,ax ;otherwise, we have our literal length do_literal_copy ;this might be better as rep movsw !!! benchmark get_word_match_offset copy_large_match_len @@val250_8: +jz @@val249_8 lodsb ;ah=0; grab single extra length byte inc ah ;ax=256+length byte xchg cx,ax @@ -512,6 +553,8 @@ ENDS CODE END + + ;Speed optimization history (decompression times in microseconds @ 4.77 MHz): ; defer add MIN_MATCH_SIZE shuttle 97207 alice 57200 robotron 362884 ++* ; jumptable rewrite, no RLE shuttle 97744 alice 46905 robotron 309032 -++ @@ -521,3 +564,18 @@ END ; long match copy #1 16-bit shuttle 92490 alice 46905 robotron 308722 +*+ ; long match copy #2 extraB shuttle 92464 alice 46905 robotron 308371 +.+ ; long match copy #3 0f->ed shuttle 86765 alice 46864 robotron 303895 +++! +; baseline new test harness shuttle 83925 alice 37948 robotron 269002 *** +; Pavel optimizations shuttle 82225 alice 36798 robotron 261226 +++ +; OPTIMIZE_LONG_RLE 1 shuttle 82242 alice 36787 robotron 261392 **- +; +;------ +; +;Pavel's optimization history: +; shuttle alice robotron time in 1.193 MHz timer clocks +;baseline 19109 D9A6 570F6 +;adc cl,0->adc cl,cl 19035 D9A6 56FAB +;rep movsb->shr cx,1;jnc 18FD4 D998 56F14 +;cmp bp,-2->inc bp;inc bp 18F07 D999 56EA3 +;jz;jc->jc 18D81 D973 56B2F +;add al,3->movsb x3 18B1E D777 56197 +;more lit_len_mat tables 18A83 D341 54ACC