Added support for the LZW/2 scheme. Fixed both LZW/1 and LZW/2 to handle "multichunk" files (that is, files more than 4K long). Renamed the NuFX-specific LZW streams to NufxLzw[12]InputStream. Tests need to be revamped heavily. Added some sample archives in both formats that decompress to the same expected output.

2024-12-22 02:31:00 +00:00 · 2008-06-25 03:22:17 +00:00 · 2008-06-25 03:22:17 +00:00 · ac9626e9c0
commit ac9626e9c0
parent 58af449378
9 changed files with 248 additions and 12 deletions
--- a/src/com/webcodepro/shrinkit/io/BitInputStream.java
+++ b/src/com/webcodepro/shrinkit/io/BitInputStream.java
@ -84,6 +84,15 @@ public class BitInputStream extends InputStream {
        data >>= requestedNumberOfBits; 
        bitsOfData-= requestedNumberOfBits; 
        return b; 
-    } 
+    }
+    
+    /**
+     * Zeroes the buffered bits and the count of bits held.  Meant to be called
+     * when shifting from buffer to buffer so that "left over" bits from the
+     * previous buffer are not carried into the next one.
+     */
+    public void clearRemainingBitsOfData() {
+        data = 0;
+        bitsOfData = 0;
+    }
 } 

--- a/src/com/webcodepro/shrinkit/io/LzwInputStream.java
+++ b/src/com/webcodepro/shrinkit/io/LzwInputStream.java
@ -20,6 +20,7 @@ public class LzwInputStream extends InputStream {
 	private BitInputStream is;
 	private List<int[]> dictionary;
 	private Queue<Integer> outputBuffer = new ConcurrentLinkedQueue<Integer>();
+	private boolean newBuffer = true;
 	// See Wikipedia entry on LZW for variable naming
 	private int k;
 	private int[] w;
@ -57,11 +58,14 @@ public class LzwInputStream extends InputStream {
 			dictionary = new ArrayList<int[]>();
 			for (short i=0; i<256; i++) dictionary.add(new int[] { i });
 			dictionary.add(new int[] { 0x100 });	// 0x100 not used by NuFX
+		}
+		if (newBuffer) {
 			// Setup for decompression;
 			k = is.read();
 			outputBuffer.add(k);
 			if (k == -1) return; 
 			w = new int[] { k };
+			newBuffer = false;
 		}
 		// LZW decompression
 		k = is.read();
@ -102,5 +106,36 @@ public class LzwInputStream extends InputStream {
 	 */
 	public void clearDictionary() {
 		dictionary = null;
+		// Ask the bit stream for 9 bits at a time and discard any partially consumed bits.
+		is.setRequestedNumberOfBits(9);
+		is.clearRemainingBitsOfData();
+		// Drop pending output and the decoder's working variables.
+		outputBuffer.clear();
+		k = 0;
+		w = null;
+		entry = null;
+		// Makes the next read run the "Setup for decompression" step again.
+		newBuffer = true;
+	}
+	
+//	/**
+//	 * Provide necessary housekeeping to reset LZW stream between NuFX buffer changes.
+//	 * The dictionary is the only item that is not cleared -- that needs to be done
+//	 * explicitly since behavior between LZW/1 and LZW/2 differ. 
+//	 */
+//	public void resetState() {
+//		is.clearRemainingBitsOfData();
+//		outputBuffer.clear();
+//		k = 0;
+//		w = null;
+//		entry = null;
+//		newBuffer = true;
+//	}
+	
+	/**
+	 * Provide housekeeping for the LZW stream between NuFX buffer changes: any
+	 * left-over bits in the bit stream and any pending output are discarded.
+	 * The dictionary is not cleared here -- that needs to be done explicitly
+	 * (see {@link #clearDictionary()}) since the behavior of LZW/1 and LZW/2 differs.
+	 * Likewise <code>k</code>, <code>w</code>, <code>entry</code> and
+	 * <code>newBuffer</code> are left as they are.
+	 */
+	public void clearData() {
+		is.clearRemainingBitsOfData();
+		outputBuffer.clear();
 	}
 }
--- a/src/com/webcodepro/shrinkit/io/NufxLzw1InputStream.java
+++ b/src/com/webcodepro/shrinkit/io/NufxLzw1InputStream.java
@ -6,7 +6,7 @@ import java.io.InputStream;
 import com.webcodepro.shrinkit.CRC16;

 /**
- * The <code>Lzw1InputStream</code> reads a data fork or
+ * The <code>NufxLzw1InputStream</code> reads a data fork or
 * resource fork written in the NuFX LZW/1 format.
 * <p>
 * The layout of the LZW/1 data is as follows:
@ -45,7 +45,7 @@ import com.webcodepro.shrinkit.CRC16;
 *  
 * @author robgreene@users.sourceforge.net
 */
-public class Lzw1InputStream extends InputStream {
+public class NufxLzw1InputStream extends InputStream {
 	/** This is the raw data stream with all markers and compressed data. */
 	private LittleEndianByteInputStream dataStream;
 	/** Used for an LZW-only <code>InputStream</code>. */
@ -70,7 +70,7 @@ public class Lzw1InputStream extends InputStream {
 	/**
 	 * Create the LZW/1 input stream.
 	 */
-	public Lzw1InputStream(LittleEndianByteInputStream dataStream) {
+	public NufxLzw1InputStream(LittleEndianByteInputStream dataStream) {
 		this.dataStream = dataStream;
 	}

@ -83,12 +83,13 @@ public class Lzw1InputStream extends InputStream {
 			volumeNumber = dataStream.readByte();
 			rleCharacter = dataStream.readByte();
 			lzwStream = new LzwInputStream(new BitInputStream(dataStream, 9));
-			rleStream = new RleInputStream(dataStream);
+			rleStream = new RleInputStream(dataStream, rleCharacter);
 			lzwRleStream = new RleInputStream(lzwStream);
 		}
 		if (bytesLeftInChunk == 0) {		// read the chunk header
 			bytesLeftInChunk = 4096;		// NuFX always reads 4096 bytes
 			lzwStream.clearDictionary();	// Always clear dictionary
+//			lzwStream.newBuffer();
 			int length = dataStream.readWord();
 			int lzwFlag = dataStream.readByte();
 			int flag = lzwFlag + (length == 4096 ? 0 : 2);
@ -106,6 +107,7 @@ public class Lzw1InputStream extends InputStream {
 		}
 		// Now we can read a data byte
 		int b = decompressionStream.read();
+		bytesLeftInChunk--;
 		dataCrc.update(b);
 		return b;
 	}
@ -137,10 +139,7 @@ public class Lzw1InputStream extends InputStream {
 	public void setRleCharacter(int rleCharacter) {
 		this.rleCharacter = rleCharacter;
 	}
-	public CRC16 getDataCrc() {
-		return dataCrc;
-	}
-	public void setDataCrc(CRC16 dataCrc) {
-		this.dataCrc = dataCrc;
+	/** Answer the current value of the CRC kept in <code>dataCrc</code>. */
+	public long getDataCrc() {
+		return dataCrc.getValue();
 	}
 }
--- a/src/com/webcodepro/shrinkit/io/NufxLzw2InputStream.java
+++ b/src/com/webcodepro/shrinkit/io/NufxLzw2InputStream.java
@ -0,0 +1,132 @@
+package com.webcodepro.shrinkit.io;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import com.webcodepro.shrinkit.CRC16;
+
+/**
+ * The <code>NufxLzw2InputStream</code> reads a data fork or
+ * resource fork written in the NuFX LZW/2 format.
+ * <p>
+ * The layout of the LZW/2 data is as follows:
+ * <table border="0">
+ * <tr>
+ *   <th colspan="3">"Fork" Header</th>
+ * </tr><tr>
+ *   <td>+0</td>
+ *   <td>Byte</td>
+ *   <td>Low-level volume number used to format 5.25" disks</td>
+ * </tr><tr>
+ *   <td>+1</td>
+ *   <td>Byte</td>
+ *   <td>RLE character used to decode this thread</td>
+ * </tr><tr>
+ *   <th colspan="3">Each subsequent 4K chunk of data</th>
+ * </tr><tr>
+ *   <td>+0</td>
+ *   <td>Word</td>
+ *   <td>Bits 0-12: Length after RLE compression<br/>
+ *       Bit 15: LZW flag (set to 1 if LZW used)</td>
+ * </tr><tr>
+ *   <td>+2</td>
+ *   <td>Word</td>
+ *   <td>If LZW flag = 1, total bytes in chunk<br/>
+ *       Else (flag = 0) start of data</td>
+ * </tr>
+ * </table>
+ * <p>
+ * The LZW/2 dictionary is only cleared when the table becomes full, which is indicated
+ * in the input stream by the code 0x100.  It is also cleared whenever a chunk that is
+ * not LZW encoded is encountered.
+ *  
+ * @author robgreene@users.sourceforge.net
+ */
+public class NufxLzw2InputStream extends InputStream {
+	/** Number of decompressed bytes that make up one chunk. */
+	private static final int CHUNK_SIZE = 4096;
+	/** Chunk header word, bits 0-12: length after RLE compression. */
+	private static final int RLE_LENGTH_MASK = 0x1fff;
+	/** Chunk header word, bit 15: set when the chunk was LZW compressed. */
+	private static final int LZW_FLAG_MASK = 0x8000;
+
+	/** This is the raw data stream with all markers and compressed data. */
+	private LittleEndianByteInputStream dataStream;
+	/** Used for an LZW-only <code>InputStream</code>. */
+	private LzwInputStream lzwStream;
+	/** Used for an RLE-only <code>InputStream</code>. */
+	private RleInputStream rleStream;
+	/** Used for an LZW+RLE <code>InputStream</code>. */
+	private InputStream lzwRleStream;
+	/** This is the generic decompression stream from which we read. */
+	private InputStream decompressionStream;
+	/** Counts the number of bytes left in the current 4096 byte chunk. */
+	private int bytesLeftInChunk;
+	/** This is the volume number for 5.25" disks; -1 until the fork header has been read. */
+	private int volumeNumber = -1;
+	/** This is the RLE character to use. */
+	private int rleCharacter;
+	/** Used to track the CRC of data we've extracted */
+	private CRC16 dataCrc = new CRC16();
+	
+	/**
+	 * Create the LZW/2 input stream.
+	 *
+	 * @param dataStream the raw fork data, positioned at the fork header
+	 */
+	public NufxLzw2InputStream(LittleEndianByteInputStream dataStream) {
+		this.dataStream = dataStream;
+	}
+
+	/**
+	 * Read the next byte in the decompressed data stream.
+	 * <p>
+	 * The fork header is read on the first call and a chunk header is read
+	 * whenever the current chunk has been used up.
+	 *
+	 * @return the next decompressed byte, or -1 once the selected decompression
+	 *         stream reports the end of its data
+	 * @throws IOException if one of the underlying streams fails
+	 */
+	@Override
+	public int read() throws IOException {
+		if (volumeNumber == -1) {
+			readForkHeader();
+		}
+		if (bytesLeftInChunk == 0) {
+			readChunkHeader();
+		}
+		// Now we can read a data byte
+		int b = decompressionStream.read();
+		if (b == -1) {
+			// End of data: do not count it against the chunk or feed it to the CRC.
+			return -1;
+		}
+		bytesLeftInChunk--;
+		dataCrc.update(b);
+		return b;
+	}
+
+	/** Read the data or resource fork header and build the decompression streams. */
+	private void readForkHeader() throws IOException {
+		volumeNumber = dataStream.readByte();
+		rleCharacter = dataStream.readByte();
+		lzwStream = new LzwInputStream(new BitInputStream(dataStream, 9));
+		rleStream = new RleInputStream(dataStream, rleCharacter);
+		// NOTE(review): unlike rleStream, this RLE stream is not handed rleCharacter
+		// -- confirm that is intended for LZW+RLE chunks.
+		lzwRleStream = new RleInputStream(lzwStream);
+	}
+
+	/** Read a chunk header and pick the stream that decodes that chunk. */
+	private void readChunkHeader() throws IOException {
+		bytesLeftInChunk = CHUNK_SIZE;	// NuFX always reads 4096 bytes
+		lzwStream.clearData();			// Allow the LZW stream to do a little housekeeping
+		int word = dataStream.readWord();
+		int length = word & RLE_LENGTH_MASK;	// only bits 0-12 carry the length
+		boolean lzwUsed = (word & LZW_FLAG_MASK) != 0;
+		if (lzwUsed) {
+			dataStream.readWord();		// At this time, the total bytes in this chunk are thrown away
+		} else {
+			lzwStream.clearDictionary();	// We clear dictionary whenever a non-LZW chunk is encountered
+		}
+		// A length of exactly one chunk means RLE was not applied.
+		boolean rleUsed = length != CHUNK_SIZE;
+		if (lzwUsed && rleUsed) {
+			decompressionStream = lzwRleStream;
+		} else if (lzwUsed) {
+			decompressionStream = lzwStream;
+		} else if (rleUsed) {
+			decompressionStream = rleStream;
+		} else {
+			decompressionStream = dataStream;
+		}
+	}
+	
+	// GENERATED CODE
+
+	/** Answer the volume number read from the fork header, or -1 if it has not been read. */
+	public int getVolumeNumber() {
+		return volumeNumber;
+	}
+	/** Set the volume number; note that {@link #read()} treats -1 as "fork header not read yet". */
+	public void setVolumeNumber(int volumeNumber) {
+		this.volumeNumber = volumeNumber;
+	}
+	/** Answer the RLE character read from the fork header. */
+	public int getRleCharacter() {
+		return rleCharacter;
+	}
+	/** Set the RLE character field. */
+	public void setRleCharacter(int rleCharacter) {
+		this.rleCharacter = rleCharacter;
+	}
+	/** Answer the current value of the CRC over the bytes returned so far. */
+	public long getDataCrc() {
+		return dataCrc.getValue();
+	}
+}
--- a/test_src/com/webcodepro/shrinkit/io/APPLE.II-LZW1.SHK
+++ b/test_src/com/webcodepro/shrinkit/io/APPLE.II-LZW1.SHK
--- a/test_src/com/webcodepro/shrinkit/io/APPLE.II-LZW2.SHK
+++ b/test_src/com/webcodepro/shrinkit/io/APPLE.II-LZW2.SHK
--- a/test_src/com/webcodepro/shrinkit/io/APPLE.II.txt
+++ b/test_src/com/webcodepro/shrinkit/io/APPLE.II.txt
--- a/test_src/com/webcodepro/shrinkit/io/NufxLzw1Test.java
+++ b/test_src/com/webcodepro/shrinkit/io/NufxLzw1Test.java
@ -1,15 +1,21 @@
 package com.webcodepro.shrinkit.io;

 import java.io.IOException;
+import java.util.List;
+
+import com.webcodepro.shrinkit.HeaderBlock;
+import com.webcodepro.shrinkit.NuFileArchive;
+import com.webcodepro.shrinkit.ThreadKind;
+import com.webcodepro.shrinkit.ThreadRecord;

 /**
 * Test some LZW/1 format streams.
 * 
 * @author robgreene@users.sourceforge.net
 */
-public class Lzw1Test extends TestCaseHelper {
+public class NufxLzw1Test extends TestCaseHelper {
 	public void testTextFile() throws IOException {
-		Lzw1InputStream is = new Lzw1InputStream(new LittleEndianByteInputStream(getTextFileLzw1StreamData()));
+		NufxLzw1InputStream is = new NufxLzw1InputStream(new LittleEndianByteInputStream(getTextFileLzw1StreamData()));
 		byte[] expected = getTextFileData();
 		byte[] actual = new byte[expected.length];
 		is.read(actual);
@ -17,6 +23,27 @@ public class Lzw1Test extends TestCaseHelper {
 		assertTrue(is.isCrcValid());
 	}
 	
+	/**
+	 * Decompress the data fork of the first entry in APPLE.II-LZW1.SHK, echoing it
+	 * to standard output.  The test fails if the decompressed stream ends before
+	 * the number of bytes given by the thread's EOF has been read.
+	 */
+	public void testAppleIIShk() throws IOException {
+		NuFileArchive archive = new NuFileArchive(getClass().getResourceAsStream("APPLE.II-LZW1.SHK"));
+		List<HeaderBlock> blocks = archive.getHeaderBlocks();
+		HeaderBlock block = blocks.get(0);	// only one file
+		if (block.getFilename() != null) System.out.printf("\n\n%s\n\n", block.getFilename());
+		List<ThreadRecord> records = block.getThreadRecords();
+		for (ThreadRecord record : records) {
+			if (record.getThreadKind() == ThreadKind.FILENAME) {
+				System.out.printf("\n\n%s\n\n", record.getText());
+			}
+			long bytes = record.getThreadEof();
+			if (record.getThreadKind() == ThreadKind.DATA_FORK) {
+				NufxLzw1InputStream is = new NufxLzw1InputStream(new LittleEndianByteInputStream(record.getRawInputStream()));
+				while ( bytes-- > 0 ) {
+					int b = is.read();
+					// -1 here means the stream ran dry before the thread's EOF was reached.
+					assertTrue(b != -1);
+					System.out.print((char)b);
+				}
+			}
+		}
+	}
+
+	
 	private byte[] getTextFileLzw1StreamData() {
 		return new byte[] {
 				(byte)0xCA, 0x42, 0x00, (byte)0xDB, (byte)0xB7, 0x00, 0x01, 0x54, 
--- a/test_src/com/webcodepro/shrinkit/io/NufxLzw2Test.java
+++ b/test_src/com/webcodepro/shrinkit/io/NufxLzw2Test.java
@ -0,0 +1,33 @@
+package com.webcodepro.shrinkit.io;
+
+import java.io.IOException;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+import com.webcodepro.shrinkit.HeaderBlock;
+import com.webcodepro.shrinkit.NuFileArchive;
+import com.webcodepro.shrinkit.ThreadKind;
+import com.webcodepro.shrinkit.ThreadRecord;
+
+/**
+ * Test some LZW/2 format streams.
+ */
+public class NufxLzw2Test extends TestCase {
+	/**
+	 * Decompress the data fork of the first entry in APPLE.II-LZW2.SHK, echoing it
+	 * to standard output.  The test fails if the decompressed stream ends before
+	 * the number of bytes given by the thread's EOF has been read.
+	 */
+	public void testPascalFile() throws IOException {
+		NuFileArchive archive = new NuFileArchive(getClass().getResourceAsStream("APPLE.II-LZW2.SHK"));
+		List<HeaderBlock> blocks = archive.getHeaderBlocks();
+		HeaderBlock block = blocks.get(0);
+		if (block.getFilename() != null) System.out.printf("\n\n%s\n\n", block.getFilename());
+		List<ThreadRecord> records = block.getThreadRecords();
+		for (ThreadRecord record : records) {
+			if (record.getThreadKind() == ThreadKind.FILENAME) {
+				System.out.printf("\n\n%s\n\n", record.getText());
+			}
+			long bytes = record.getThreadEof();
+			if (record.getThreadKind() == ThreadKind.DATA_FORK) {
+				NufxLzw2InputStream is = new NufxLzw2InputStream(new LittleEndianByteInputStream(record.getRawInputStream()));
+				while ( bytes-- > 0 ) {
+					int b = is.read();
+					// -1 here means the stream ran dry before the thread's EOF was reached.
+					assertTrue("Stream ended with " + (bytes + 1) + " byte(s) still expected", b != -1);
+					System.out.print((char)b);
+				}
+			}
+		}
+	}
+}