diff --git a/README.md b/README.md index 7ec7140..c3527ee 100644 --- a/README.md +++ b/README.md @@ -18,19 +18,28 @@ join in--send patches, help add stuff, etc. ## The method +Documenting this so other texts can be converted in the future... + +First we need to extract the text documents from the disks and turn them into +something we can use on a modern system: + 1. The DOS 3.3 disks were dumped using cppo -2. Apply the following transformations to each document file: +2. Apply `scripts/extract_piewriter.py` to each document file, which performs + the following transformations: * For characters 0xa0-0xfe, strip the high bit to get pure ASCII * Convert 0x0d and 0x8d (return) characters ti 0x0a (newline) * Escape all else in C-style -3. Remove NUL at end of .txt files -4. .pp dot command is paragraph break, replace with blank line. -5. Remove trailing whitespace -6. Normalize case and spacing of dot commands (lowercase here) +3. Remove NUL at end of .txt files and rename the assembly source to .s +4. Remove trailing whitespace +5. Normalize dot commands (lowercase, spacing) for easier mechanical parsing. +6. Remove the obvious dot commands (.pp is a paragraph break, .sp creates + vertical space, .br seems to be a line break, .bp a page break) and attempt + to remove or interpret others as seems appropriate. - -This has probably broken the .s files a bit, and I haven't bothered to decompile -the five byte HELLO ... ;) +This process has probably broken the .s files. There were also some files that +don't appear to have actually been part of the text (or maybe they were edits +and revisions?), and there was bitrot in the files, suggesting the disks the +source documents were stored on were losing their integrity. 
#! /usr/bin/env python3
#
# Provenance (recovered from a collapsed patch): added as
# scripts/extract_piewriter.py, "new file mode 100755", index 0000000..31d88df.
# The same patch line also carried these README.md context lines:
#   [dons-disks]: http://www.6502lane.net/2015/03/12/don-worths-beneath-apple-dos-original-text-files/
#   [archive.org]: https://archive.org/

"""Usage: extract_piewriter.py FILE [FILE ...]

Converts PIEWriter documents that were dumped as raw "#064000" (binary blob)
files from Apple DOS 3.3 disks. Each input byte is handled as follows:

  - Bytes 0xa0-0xfe (printable ASCII with the high bit set) have the high
    bit stripped.
  - 0x0d and 0x8d (CR without/with the high bit) and 0x8a (LF with the high
    bit) become a UNIX-style LF (0x0a).
  - Every other byte is replaced with its C-style escaped hex representation
    (e.g., NUL is replaced with \\x00).

The result for each FILE is saved as FILE.txt. The output is rough, but it's
enough to check in to a git repository and begin cleaning up what are now
proper text files.
"""

import sys

# Input byte values that are rewritten as a single LF.
# NOTE(review): a plain 0x0a is not in this set, so it is escaped as \x0a
# like any other unexpected byte -- confirm that this is intended.
_NEWLINE_BYTES = frozenset((0x0d, 0x8d, 0x8a))


def _convert_byte(val: int) -> bytes:
    """Return the output bytes for one input byte value (0-255)."""
    if 0xa0 <= val < 0xff:
        # High-bit printable character: keep only the low seven bits.
        return bytes((val & 0x7f,))
    if val in _NEWLINE_BYTES:
        return b'\n'
    # Anything else is made visible as a C-style hex escape.
    return '\\x{:02x}'.format(val).encode('ASCII')


# One precomputed entry per possible byte value, so converting a document is
# a table lookup per byte instead of a format-and-encode per byte.
_CONVERSION_TABLE = tuple(_convert_byte(val) for val in range(256))


def convert(data: bytes) -> bytes:
    """Convert the raw bytes of one PIEWriter document.

    data: the file contents as a bytes-like object (iterating it must yield
        integers in the range 0-255).

    Returns the converted document as bytes; see the module docstring for
    the per-byte rules. Empty input yields empty output.
    """
    return b''.join(_CONVERSION_TABLE[val] for val in data)


def main(argv) -> int:
    """Convert every file named in argv[1:], writing <name>.txt beside it.

    argv: the full argument vector, program name first (as in sys.argv).

    Returns the process exit status: 1 after printing the usage text when no
    file was given, 0 once all files have been converted. Errors from
    opening, reading or writing a file are not caught and propagate.
    """
    if len(argv) < 2:
        print(__doc__)
        return 1

    for arg in argv[1:]:
        with open(arg, 'rb') as f:
            converted = convert(f.read())

        outname = arg + '.txt'
        print('Saving', outname)
        with open(outname, 'wb') as f:
            f.write(converted)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))