From 198ce1c4b4c900dfc7f4f4abcbeac18f8dc0c831 Mon Sep 17 00:00:00 2001 From: Dave Lyons Date: Wed, 19 Aug 2020 02:26:15 -0700 Subject: [PATCH] Add "index.py", which is similar to the Davex "index" command. It will construct the INDEXED.HELP file at build time. Work in progress. It has a few loose ends (see TODOs), and it doesn't do the traditional compression yet (5 bits %1xxxx for the 16 most popular characters, 8 bits for the other characters, 1..127). --- build/index.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100755 build/index.py diff --git a/build/index.py b/build/index.py new file mode 100755 index 0000000..8ff4c44 --- /dev/null +++ b/build/index.py @@ -0,0 +1,96 @@ +#!/usr/bin/python3 + +# [TODO] extract filename from pathname +# [TODO] check that we will not overflow the index into the text data +# [TODO] add the extra index entries if requested +# [TODO] what is the 4-byte length field in each entry? I don't think it's ever used, but traditionally the index command has set it to something. + +# index create indexed.help +# index add indexed.help [additional-alias-for-this-file] + +# Format of an indexed file is: +# +# +000 $00 DvxIdx $00 +# +008 Offset_to_text (4 bytes) +# +012 Offset_to_index (4 bytes) +# +016 Offset_to_free_index_space (4 bytes) +# +020 +# +052 start of index +# +# Index is: +# length-pfx'd string, offset(4), length(4) +# : +# $00 +# +# Text chunks are terminated by a $00. The data +# is compressed, with 16 common characters using +# only 5 bits (%1xxxx), and the remaining ones +# using 8 bits (%0xxxxxxx). + +import sys, struct + +verb = sys.argv[1] +indexFile = sys.argv[2] + +if verb == "create": + indexSize = int(sys.argv[3]) + print("We will create with size = " + str(indexSize)) + offsetToIndex = 52 + offsetOfText = offsetToIndex + indexSize + + outFile = open(indexFile, "wb") + outFile.write(bytes("\x00DvxIdx\x00", "utf8")) + outFile.write(struct.pack('i', offsetOfText)) + outFile.write(struct.pack('i', offsetToIndex)) + outFile.write(struct.pack('i', offsetToIndex)) # start of free space in index + for pad in range(8): + outFile.write(struct.pack('i', 0)) # 8 four-byte integers: reserved + + outFile.seek(offsetOfText - 1) + outFile.write(struct.pack('b', 0)) + exit(0) + +if verb == "add": + outFile = open(indexFile, "r+b") + addFilePath = sys.argv[3] + addFile = open (addFilePath, "r") + addLines = addFile.readlines() + addFile.close() + + # End-of-file offset is where we will put the new chunk of compressed text. + outFile.seek(0, 2) # end of file + offsetToNewText = outFile.tell() + + # Seek to free index space + outFile.seek(16) + data = outFile.read(4) + freeIndexSpace = int.from_bytes(data, "little") + print("freeIndexSpace = ", freeIndexSpace) + + # Write new index entry: length-byte, string name, 4-byte offset to start of text, 4-byte length of (compressed?) text + entryName = addFilePath # [TODO] extract just the filename portion + outFile.seek(freeIndexSpace) + outFile.write(struct.pack('b', len(entryName))) + outFile.write(bytes(entryName, "utf8")) + outFile.write(struct.pack('i', offsetToNewText)) + outFile.write(struct.pack('i', 0x77777777)) # [TODO] write the length + + # Optionally write additional index entries for the same text we are adding. + + # Update freeIndexSpace in file + freeIndexSpace = outFile.tell() + outFile.seek(16) + outFile.write(struct.pack('i', freeIndexSpace)) + + # Write the new text + outFile.seek(offsetToNewText) + for line in addLines: + for ch in line.rstrip(): + outFile.write(bytes(ch, "utf8")) + outFile.write(struct.pack('b', 13)) # carriage return + outFile.write(struct.pack('b', 0)) + outFile.close() + exit(0) + +print("Unknown verb: " + verb) +exit(1)