apple2js/js/applesoft/decompiler.ts

import { byte, word, ReadonlyUint8Array, Memory } from '../types';
import { toHex } from 'js/util';
import { TOKEN_TO_STRING, STRING_TO_TOKEN } from './tokens';
import { TXTTAB, PRGEND } from './zeropage';

const LETTERS =
    '                                ' +
    ' !"#$%&\'()*+,-./0123456789:;<=>?' +
    '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_' +
    '`abcdefghijklmnopqrstuvwxyz{|}~ ';

/**
 * Resolves a token value to a token string or character.
 *
 * @param token
 * @returns string representing token
 */
const resolveToken = (token: byte) => {
    let tokenString;
    if (token >= 0x80 && token <= 0xea) {
        tokenString = TOKEN_TO_STRING[token];
    } else if (LETTERS[token] !== undefined) {
        tokenString = LETTERS[token];
    } else {
        tokenString = `[${toHex(token)}]`;
    }
    return tokenString;
};

interface ListOptions {
    apple2: 'e' | 'plus';
    columns: number; // usually 40 or 80
}

const DEFAULT_LIST_OPTIONS: ListOptions = {
    apple2: 'e',
    columns: 40,
};

interface DecompileOptions {
    style: 'compact' | 'pretty';
}

const DEFAULT_DECOMPILE_OPTIONS: DecompileOptions = {
    style: 'pretty',
};

const MAX_LINES = 32768;

export default class ApplesoftDecompiler {

    /**
     * Returns a decompiler for the program in the given memory.
     *
     * The memory is assumed to have set `TXTTAB` and `PRGEND` correctly.
     */
    static decompilerFromMemory(ram: Memory): ApplesoftDecompiler {
        const program: byte[] = [];

        const start = ram.read(0x00, TXTTAB) + (ram.read(0x00, TXTTAB + 1) << 8);
        const end = ram.read(0x00, PRGEND) + (ram.read(0x00, PRGEND + 1) << 8);
        if (start >= 0xc000 || end >= 0xc000) {
            throw new Error(`Program memory ${toHex(start, 4)}-${toHex(end, 4)} out of range`);
        }
        for (let addr = start; addr <= end; addr++) {
            program.push(ram.read(addr >> 8, addr & 0xff));
        }

        return new ApplesoftDecompiler(new Uint8Array(program), start);
    }

    /**
     * Constructs a decompiler for the given program data. The data is
     * assumed to be a dump of memory beginning at `base`. If the data
     * does not cover the whole program, attempting to decompile will
     * fail.
     *
     * @param program The program bytes.
     * @param base Address of start of program, or 0 to compute start
     */
    constructor(
        private readonly program: ReadonlyUint8Array,
        private readonly base: word = 0x801
    ) {
        if (this.base === 0) {
            // Signals that we're loading a file from disk, and
            // addresses are arbitrarily absolute, so we compute
            // base by taking the next line address and adjusting it to
            // the actual beginning of the next line.
            const nextLine = this.wordAt(0);
            // Start at beginning of first line
            let nextLineIndex = 4;
            // Find 0 at end of line
            while (program[nextLineIndex]) {
                nextLineIndex++;
            }
            // Move to beginning of next line
            nextLineIndex++;
            // Adjust base
            this.base = nextLine - nextLineIndex;
        }
    }

    /** Returns the 2-byte word at the given offset. */
    private wordAt(offset: word): word {
        return this.program[offset] + (this.program[offset + 1] << 8);
    }

    /**
     * Iterates through the lines of the given program in the order of
     * the linked list of lines, starting from the first line. This
     * does _not_ mean that all lines in memory will
     *
     * @param from First line for which to call the callback.
     * @param to Last line for which to call the callback.
     * @param callback A function to call for each line. The first parameter
     *     is the offset of the line number of the line; the tokens follow.
     */
    private forEachLine(
        from: number, to: number,
        callback: (offset: word) => void): void {
        let count = 0;
        let offset = 0;
        let nextLineAddr = this.wordAt(offset);
        let nextLineNo = this.wordAt(offset + 2);
        while (nextLineAddr !== 0 && nextLineNo < from) {
            if (++count > MAX_LINES) {
                throw new Error('Loop detected in listing');
            }
            offset = nextLineAddr;
            nextLineAddr = this.wordAt(offset);
            nextLineNo = this.wordAt(offset + 2);
        }
        while (nextLineAddr !== 0 && nextLineNo <= to) {
            if (++count > MAX_LINES) {
                throw new Error('Loop detected in listing');
            }
            callback(offset + 2);
            offset = nextLineAddr - this.base;
            nextLineAddr = this.wordAt(offset);
            nextLineNo = this.wordAt(offset + 2);
        }
    }

    /** Lists a single line like an Apple II. */
    listLine(offset: word, options: ListOptions): string {
        const lines: string[] = [];
        let line = '';

        const lineNo = this.wordAt(offset);
        // The Apple //e prints a space before each line number to make
        // it easier to edit the lines.  The change is at the subroutine
        // called at D6F9: on the //e it is SPCLIN (F7AA), on the ][+ it
        // is LINPRT (ED24).
        if (options.apple2 === 'e') {
            line += ' '; // D6F9: JSR SPCLIN
        }
        line += `${lineNo} `; // D6FC, always 1 space after line number
        offset += 2;

        // In the original ROM, the line length is checked immediately
        // after the line number is printed. For simplicity, this method
        // always assumes that there is space for one token—which would
        // have been the case on a realy Apple.
        while (this.program[offset] !== 0) {
            if (offset >= this.program.length) {
                lines.unshift('Unterminated line: ');
                break;
            }
            const token = this.program[offset];
            if (token >= 0x80 && token <= 0xea) {
                line += ' '; // D750, always put a space in front of token
                line += resolveToken(token);
                line += ' '; // D762, always put a trailing space
            } else {
                line += resolveToken(token);
            }
            offset++;

            // The Apple //e and ][+ differ in how they choose to break
            // long lines. In the ][+, D705 prints a newline if the
            // current column (MON_CH, $24) is greater than or equal to
            // 33. In the //e, control is passed to GETCH (F7B4), which
            // uses column 33 in 40-column mode and column 73 in 80-column
            // mode.
            //
            // The ][+ behaves more like a //e when there is an 80-column
            // card active (programs wrap at column 73).  From what I can
            // tell (using Virtual ]['s inspector), the 80-column card
            // keeps MON_CH at zero until the actual column is >= 71, when
            // it sets it to the actual cursor position - 40.  In the
            // Videx Videoterm ROM, this fixup happens in BASOUT (CBCD) at
            // CBE6 by getting the 80-column horizontal cursor position and
            // subtracting 0x47 (71). If the result is less than zero, then
            // 0x00 is stored in MON_CH, otherwise 31 is added back and the
            // result is stored in MON_CH. (The manual is archived at
            // http://www.apple-iigs.info/doc/fichiers/videoterm.pdf, among
            // other places.)
            //
            // For out purposes, we're just going to use the number of
            // columns - 7.
            if (line.length >= options.columns - 7) {
                line += '\n';
                lines.push(line);
                line = '     ';
            }
        }
        lines.push(line + '\n');
        return lines.join('');
    }

    /**
     * Lists the program in the same format that an Apple II prints to the
     * screen.
     *
     * This method also accepts a starting and ending line number. Like on
     * an Apple II, this will print all of the lines between `from` and `to`
     * (inclusive) regardless of the actual line numbers between them.
     *
     * To list a single line, pass the same number for both `from` and `to`.
     *
     * @param options The options for formatting the output.
     * @param from The first line to print (default 0).
     * @param to The last line to print (default end of program).
     */
    list(options: Partial<ListOptions> = {},
        from: number = 0, to: number = 65536): string {
        const allOptions = { ...DEFAULT_LIST_OPTIONS, ...options };

        let result = '';
        this.forEachLine(from, to, offset => {
            result += this.listLine(offset, allOptions);
        });
        return result;
    }

    /**
     * Returns a single line for the given compiled line in as little
     * space as possible.
     */
    compactLine(offset: word): string {
        let result = '';
        let spaceIf: (nextToken: string) => boolean = () => false;

        const lineNo = this.wordAt(offset);
        result += lineNo;
        spaceIf = (nextToken: string) => /^\d/.test(nextToken);
        offset += 2;

        while (this.program[offset] !== 0) {
            if (offset >= this.program.length) {
                return 'Unterminated line: ' + result;
            }
            const token = this.program[offset];
            let tokenString = resolveToken(token);
            if (tokenString === 'PRINT') {
                tokenString = '?';
            }
            if (spaceIf(tokenString)) {
                result += ' ';
            }

            result += tokenString;

            spaceIf = () => false;
            if (token === STRING_TO_TOKEN['AT']) {
                spaceIf = (nextToken) => nextToken.toUpperCase().startsWith('N');
            }

            offset++;
        }
        return result;
    }

    /**
     * Returns a single line for the compiled line, but with even spacing:
     *   * space after line number (not before)
     *   * space before and after colons (`:`)
     *   * space around equality and assignment operators (`=`, `<=`, etc.)
     *   * space after tokens, unless it looks like a function call
     *   * space after commas, but not before
     */
    prettyLine(offset: word): string {
        let result = '';
        let inString = false;
        let spaceIf: (char: byte) => boolean = () => false;

        const lineNo = this.wordAt(offset);
        result += `${lineNo} `;
        offset += 2;

        while (this.program[offset] !== 0) {
            if (offset >= this.program.length) {
                return 'Unterminated line: ' + result;
            }
            const token = this.program[offset];
            const tokenString = resolveToken(token);
            if (tokenString === '"') {
                inString = !inString;
            }

            if (spaceIf(token) || (!inString && tokenString === ':')) {
                result += ' ';
            }

            result += tokenString;

            if (!inString && tokenString === ':') {
                spaceIf = () => true;
            } else if (!inString && tokenString === ',') {
                spaceIf = () => true;
            } else if (token >= 0xcf && token <= 0xd1) {
                // For '<', '=', '>', don't add a space between them.
                spaceIf = (token: byte) => token < 0xcf || token > 0xd1;
            } else if (token > 0x80 && token < 0xea) {
                // By default, if a token is followed by an open paren, don't
                // add a space.
                spaceIf = (token: byte) => token !== 0x28;
            } else {
                // By default, if a literal is followed by a token, add a space.
                spaceIf = (token: byte) => token >= 0x80 && token <= 0xea;
            }

            offset++;
        }
        return result;
    }

    /**
     * Decompiles the program based on the given options.
     */
    decompile(options: Partial<DecompileOptions> = {},
        from: number = 0, to: number = 65536): string {
        const allOptions = { ...DEFAULT_DECOMPILE_OPTIONS, ...options };

        const results: string[] = [];
        this.forEachLine(from, to, offset => {
            results.push(allOptions.style === 'compact' ? this.compactLine(offset) : this.prettyLine(offset));
        });
        return results.join('\n');
    }
}