diff --git a/compiler/res/prog8lib/cx16/diskio.p8 b/compiler/res/prog8lib/cx16/diskio.p8
index dceec47b8..a3e9612c6 100644
--- a/compiler/res/prog8lib/cx16/diskio.p8
+++ b/compiler/res/prog8lib/cx16/diskio.p8
@@ -738,6 +738,24 @@ io_error:
         cbm.CLOSE(15)
     }
 
+    sub get_loadaddress(str filename) -> uword {
+        ; get the load address from a PRG file (usually $0801 but it can be different). Returns 0 if the file can't be opened or read.
+        cbm.SETNAM(strings.length(filename), filename)
+        cbm.SETLFS(READ_IO_CHANNEL, drivenumber, READ_IO_CHANNEL)
+        void cbm.OPEN()          ; open 12,8,12,"filename"
+        cx16.r0 = 0              ; default (failure) result; this must leave the carry flag from OPEN intact for if_cc below
+        if_cc {
+            cbm.CHKIN(READ_IO_CHANNEL)
+            cx16.r0L = cbm.CHRIN()      ; the first byte in the file is the low byte of the address
+            cx16.r0H = cbm.CHRIN()
+            if cbm.READST()!=0
+                cx16.r0 = 0             ; non-zero I/O status after reading the two bytes: report failure
+        }
+        cbm.CLRCHN()             ; CHKIN redirected input to the file: restore the default channels before closing it
+        cbm.CLOSE(READ_IO_CHANNEL)
+        return cx16.r0
+    }
+
 
     ; CommanderX16 extensions over the basic C64/C128 diskio routines:
 
diff --git a/compiler/res/prog8lib/shared_cbm_diskio.p8 b/compiler/res/prog8lib/shared_cbm_diskio.p8
index ae35e86cc..4905ed432 100644
--- a/compiler/res/prog8lib/shared_cbm_diskio.p8
+++ b/compiler/res/prog8lib/shared_cbm_diskio.p8
@@ -651,4 +651,22 @@ io_error:
         cbm.CLRCHN()
         cbm.CLOSE(15)
     }
+
+    sub get_loadaddress(str filename) -> uword {
+        ; get the load address from a PRG file (usually $0801 but it can be different). Returns 0 if the file can't be opened or read.
+        cbm.SETNAM(strings.length(filename), filename)
+        cbm.SETLFS(READ_IO_CHANNEL, drivenumber, READ_IO_CHANNEL)
+        void cbm.OPEN()          ; open 12,8,12,"filename"
+        cx16.r0 = 0              ; default (failure) result; this must leave the carry flag from OPEN intact for if_cc below
+        if_cc {
+            cbm.CHKIN(READ_IO_CHANNEL)
+            cx16.r0L = cbm.CHRIN()      ; the first byte in the file is the low byte of the address
+            cx16.r0H = cbm.CHRIN()
+            if cbm.READST()!=0
+                cx16.r0 = 0             ; non-zero I/O status after reading the two bytes: report failure
+        }
+        cbm.CLRCHN()             ; CHKIN redirected input to the file: restore the default channels before closing it
+        cbm.CLOSE(READ_IO_CHANNEL)
+        return cx16.r0
+    }
 }
diff --git a/compiler/res/prog8lib/shared_compression.p8 b/compiler/res/prog8lib/shared_compression.p8
index 90187f100..113dcb101 100644
--- a/compiler/res/prog8lib/shared_compression.p8
+++ b/compiler/res/prog8lib/shared_compression.p8
@@ -624,8 +624,6 @@ zx0_gamma_done: tax                             ; Preserve bit-buffer.
         ;
         ; NOTE: for speed reasons this decompressor is NOT bank-aware and NOT I/O register aware;
         ;       it only outputs to a memory buffer somewhere in the active 64 Kb address range
-        ;
-        ; TODO: include the in-place decompression version as well?
 
         %asm {{
 
@@ -859,6 +857,265 @@ lzput 	= cx16.r3	; 2 bytes
     }
 
 
+    asmsub decode_tscrunch_inplace(uword compressed @R0) clobbers(A,X,Y) {
+        ; Decompress a block of data compressed by TSCRUNCH *in place*
+        ; This can save an extra memory buffer if you are reading crunched data from a file into a buffer.
+        ; see https://github.com/tonysavon/TSCrunch
+        ; It has extremely fast decompression (approaching RLE speeds),
+        ; better compression than RLE, but slightly worse compression ratio than LZSA
+        ;
+        ; NOTE: to allow in-place decompression you need to use -i switch when crunching.
+        ;       also, both the input data file and compressed data file are PRG files with a load header!
+        ; NOTE: for speed reasons this decompressor is NOT bank-aware and NOT I/O register aware;
+        ;       it only outputs to a memory buffer somewhere in the active 64 Kb address range
+        %asm {{
+
+; NMOS 6502 decompressor for data stored in TSCrunch format.
+;
+; Copyright Antonio Savona 2022.
+; Distributed under the Apache software License v2.0 https://www.apache.org/licenses/LICENSE-2.0
+;
+; Adapted for Prog8 and 6502 CMOS by Irmen de Jong.
+;
+; The stream starts with 4 bytes: output address lo, hi, the run length patched into 'optRun', and the final output byte.
+; The routine patches its own operands (optRun+1, lzto+1), so it has to execute from RAM.
+.if cx16.r0 < $100
+    ; r0-r15 registers are in zeropage just use those
+tsget 	= cx16.r0	; 2 bytes
+tsput 	= cx16.r1	; 2 bytes
+tstemp	= cx16.r2
+lzput 	= cx16.r3	; 2 bytes
+.else
+    .error "in decode_tscrunch_inplace: r0-15 are not in zeropage and no alternatives have been set up yet"     ; TODO
+.endif
+
+
+.if cx16.r0>=$100
+            ; set up the source and destination pointer
+            lda  cx16.r0L
+            sta  tsget
+            lda  cx16.r0H
+            sta  tsget+1
+.endif
+
+			; copy the 4 header bytes to tsput+0..3; the first two form the output pointer
+
+			ldy #$ff
+		-	iny
+			lda (tsget),y
+			sta tsput , y	; iterations 2 and 3 spill into tsput+2 / tsput+3 (scratch; tstemp is always set before it is read)
+			cpy #3
+			bne -
+
+			pha				; 4th header byte: stored as the very last output byte at 'done'
+
+			lda tsput+2		; 3rd header byte as stored by the loop above (lzput does NOT alias tsput+2 with these labels)
+			sta optRun + 1
+
+			tya
+			ldy #0
+			beq update_getonly	; always taken; A=3 plus the carry set by cpy skips the 4 header bytes
+
+	entry2:
+			; ILLEGAL lax (tsget),y
+			lda (tsget),y
+			tax
+
+			bmi rleorlz
+
+			cmp #$20
+			bcs lz2
+
+	; literal
+
+			inc tsget
+			beq updatelit_hi
+		return_from_updatelit:
+
+		ts_delit_loop:
+
+			lda (tsget),y
+			sta (tsput),y
+			iny
+			dex
+
+			bne ts_delit_loop
+
+			tya
+			tax
+			; carry is clear
+			ldy #0
+
+	updatezp_noclc:
+			adc tsput
+			sta tsput
+			bcs updateput_hi
+		putnoof:
+			txa
+		update_getonly:
+			adc tsget
+			sta tsget
+			bcc entry2
+			inc tsget+1
+			bcs entry2
+
+	updatelit_hi:
+			inc tsget+1
+			bcc return_from_updatelit
+	updateput_hi:
+			inc tsput+1
+			clc
+			bcc putnoof
+
+	rleorlz:
+
+			; ILLEGAL: alr #$7f
+			and #$7f
+			lsr a
+			bcc ts_delz
+
+		; RLE
+			beq optRun
+
+		plain:
+			ldx #2
+			iny
+			sta tstemp		; number of bytes to de-rle
+
+			lda (tsget),y	; fetch rle byte
+			ldy tstemp
+		runStart:
+			sta (tsput),y
+
+		ts_derle_loop:
+
+			dey
+			sta (tsput),y
+
+			bne ts_derle_loop
+
+			; update zero page with a = runlen, x = 2 , y = 0
+			lda tstemp
+
+			bcs updatezp_noclc
+
+	   done:
+	   		pla
+	   		sta (tsput),y
+			rts
+	; LZ2
+		lz2:
+			beq done
+
+			ora #$80
+			adc tsput
+			sta lzput
+			lda tsput + 1
+			sbc #$00
+			sta lzput + 1
+
+			; y already zero
+			lda (lzput),y
+			sta (tsput),y
+			iny
+			lda (lzput),y
+			sta (tsput),y
+
+			tya
+			dey
+
+			adc tsput
+			sta tsput
+			bcs lz2_put_hi
+		skp:
+			inc tsget
+			bne entry2
+			inc tsget + 1
+			bne entry2
+
+		lz2_put_hi:
+			inc tsput + 1
+			bcs skp
+
+	; LZ
+	ts_delz:
+
+			lsr a
+			sta lzto + 1
+
+			iny
+
+			lda tsput
+			bcc long
+
+			sbc (tsget),y
+			sta lzput
+			lda tsput+1
+
+			sbc #$00
+
+			ldx #2
+			; lz MUST decrunch forward
+	lz_put:
+			sta lzput+1
+
+			ldy #0
+
+			lda (lzput),y
+			sta (tsput),y
+
+			iny
+			lda (lzput),y
+			sta (tsput),y
+
+	ts_delz_loop:
+
+			iny
+
+			lda (lzput),y
+			sta (tsput),y
+
+	lzto:	cpy #0
+			bne ts_delz_loop
+
+			tya
+
+			; update zero page with a = runlen, x = 2, y = 0
+			ldy #0
+			; clc not needed as we have len - 1 in A (from the encoder) and C = 1
+
+			jmp updatezp_noclc
+
+	optRun:
+			ldy #255		; operand is patched with the 3rd header byte at startup
+			sty tstemp
+
+			ldx #1
+			; A is zero
+
+			bne runStart
+
+	long:
+			; carry is clear and compensated for from the encoder
+			adc (tsget),y
+			sta lzput
+			iny
+			; ILLEGAL lax (tsget),y
+			lda (tsget),y
+			tax
+			ora #$80
+			adc tsput + 1
+
+			cpx #$80
+			rol lzto + 1
+			ldx #3
+
+			bne lz_put
+
+	        ; !notreached!
+        }}
+    }
+
+
 /***
     ; prog8 source code for the RLE routines above:
 
diff --git a/docs/source/libraries.rst b/docs/source/libraries.rst
index 2498e1c39..f9d4fb762 100644
--- a/docs/source/libraries.rst
+++ b/docs/source/libraries.rst
@@ -324,6 +324,30 @@ API is slightly experimental and may change in a future version.
     **NOTE:** for speed reasons this decompressor is NOT bank-aware and NOT I/O register aware;
     it only outputs to a memory buffer somewhere in the active 64 Kb address range.
 
+``decode_tscrunch_inplace (uword compressed)``
+    Decompress a block of data compressed in the TSCrunch format *inplace*.
+    This can save an extra memory buffer if you are reading crunched data from a file into a buffer.
+    It has extremely fast decompression (approaching RLE speeds),
+    better compression than RLE, but slightly worse compression ratio than LZSA.
+    See https://github.com/tonysavon/TSCrunch for the compression format and compressor tool.
+    **NOTE:** for speed reasons this decompressor is NOT bank-aware and NOT I/O register aware;
+    it only outputs to a memory buffer somewhere in the active 64 Kb address range.
+
+    .. note::
+        The TSCrunch in-place format is a bit different than regular memory decompression.
+        It works with PRG files (so with a 2 byte load-address header) for both the *source* and *compressed* data files.
+        So if you want to compress and decompress a block of data from $a000-$c000 your source file has to start with
+        the bytes $00 $a0 (the load address in low-byte, high-byte order), followed by the 8192 data bytes, for a total of 8194 bytes.
+        Then you need to call the compressor program with the '-i' argument to tell it to create an in-place compressed data file.
+        The data file will *not* be loaded at $a000 but have its own load address closer to the end of the memory buffer.
+        If all is well, you can then load and decompress it like so::
+
+            uword tsi_start_addr = diskio.get_loadaddress("data8kb.tsi")
+            cx16.rambank(2)     ; or whatever ram bank you want on the X16
+            void diskio.load("data8kb.tsi", 0)      ; not load_raw!
+            cx16.rambank(2)     ; make sure the ram bank is still the same
+            compression.decode_tscrunch_inplace(tsi_start_addr)
+
 ``decode_zx0 (uword compressed, uword target)``
     Decompress a block of data compressed in the ZX0 format.
     This has faster decompression than LZSA, and a slightly better compression ratio as well.
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index 630196738..1feee5384 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -1,6 +1,8 @@
 TODO
 ====
 
+- diskio: if loading into a hiram bank exactly fills the bank, the end address is still reset to $a000 and the bank is increased by 1. That should probably not happen.
+
 - DONE: make word arrays split by default and add new @nosplit tag to make an array use the old linear storage format
 - DONE: &splitarray  will give you the start address of the lsb-array (which is immediately followed by the msb-array)
 - DONE: add &< and &> operators to get the address of the lsb-array and msb-array, respectively.  (&< is just syntactic sugar for &)
@@ -74,7 +76,6 @@ IR/VM
 Libraries
 ---------
 - monogfx: flood fill should be able to fill stippled
-- Add in-place TSCrunch decoder routine as well to compression lib?  May come in handy where you load a block of compressed data, decompress it in place in the same buffer/memory bank
 - Sorting module gnomesort_uw could be optimized more, rewrite in asm? Shellshort seems consistently faster even if most of the words are already sorted.
 - Add split-word array sorting routines to sorting module?
 - pet32 target: make syslib more complete (missing kernal routines)?