Update documentation.

2024-05-29 00:41:29 +00:00 · 2012-06-09 01:06:25 -07:00 · 2012-06-09 01:06:25 -07:00 · ffd96a8c2f
commit ffd96a8c2f
parent 07f807d680
23 changed files with 3264 additions and 276 deletions
--- a/doc/cmdref.sgm
+++ b/doc/cmdref.sgm
@ -302,10 +302,10 @@
         </para>
 <programlisting>
 .macro store16   ; `store16 dest, src
-	lda #&lt;_2
+        lda #&lt;_2
-	sta _1
+        sta _1
-	lda #&gt;_2
+        lda #&gt;_2
-	sta _1+1
+        sta _1+1
 .macend
 </programlisting>
         <para>
@ -361,91 +361,202 @@
         follow.
       </para>
       <itemizedlist>
-       <listitem><para><literal>.advance</literal> <emphasis>address</emphasis>:
+         <listitem>
-      Forces the program counter to
+           <para>
-      be <emphasis>address</emphasis>. Unlike
+             <literal>.outfile</literal> <emphasis>filename</emphasis>:
-      the <literal>.org</literal>
+             Sets the filename for the output binary if one has not
-      directive, <literal>.advance</literal> outputs zeroes until the
+             already been set. If no name is ever set, the output will
-      program counter reaches a specified address. Attempting
+             be written to <literal>ophis.bin</literal>.
-      to <literal>.advance</literal> to a point behind the current
+           </para>
-      program counter is an assemble-time error.</para></listitem>
+         </listitem>
-      <listitem><para><literal>.alias</literal> <emphasis>label</emphasis> <emphasis>value</emphasis>: The
+         <listitem>
-      .alias directive assigns an arbitrary value to a label. This
+           <para>
-      value may be an arbitrary argument, but cannot reference any
+             <literal>.advance</literal> <emphasis>address</emphasis>:
-      label that has not already been defined (this prevents
+             Forces the program counter to
-      recursive label dependencies).</para></listitem>
+             be <emphasis>address</emphasis>. Unlike
-      <listitem><para><literal>.byte</literal> <emphasis>arg</emphasis> [ , <emphasis>arg</emphasis>, ... ]:
+             the <literal>.org</literal>
-      Specifies a series of arguments, which are evaluated, and
+             directive, <literal>.advance</literal> outputs zeroes
-      strings, which are included as raw ASCII data. The final
+             until the program counter reaches a specified
-      results of these arguments must be one byte in size. Seperate
+             address. Attempting to <literal>.advance</literal> to a
-      constants are seperated by comments.</para></listitem>
+             point behind the current program counter is an
-      <listitem><para><literal>.checkpc</literal> <emphasis>address</emphasis>: Ensures that the
+             assemble-time error.
-      program counter is less than or equal to the address
+         </para>
-      specified, and emits an assemble-time error if it is not.
+         </listitem>
-      <emphasis>This produces no code in the final binary - it is there to
+         <listitem>
-      ensure that linking a large amount of data together does not
+           <para>
-      overstep memory boundaries.</emphasis></para></listitem>
+             <literal>.alias</literal> <emphasis>label</emphasis> <emphasis>value</emphasis>:
-      <listitem><para><literal>.data</literal> <emphasis>[label]</emphasis>: Sets the segment to
+             The .alias directive assigns an arbitrary value to a
-      the segment name specified and disallows output. If no label
+             label. This value may be an arbitrary argument, but
-      is given, switches to the default data segment.</para></listitem>
+             cannot reference any label that has not already been
-      <listitem><para><literal>.incbin</literal> <emphasis>filename</emphasis>: Inserts the
+             defined (this prevents recursive label
-      contents of the file specified as binary data. Use it to
+             dependencies).
-      include graphics information, precompiled code, or other
+         </para>
-      non-assembler data.</para></listitem>
+         </listitem>
-      <listitem><para><literal>.include</literal> <emphasis>filename</emphasis>: Includes the
+         <listitem>
-      entirety of the file specified at that point in the program.
+           <para>
-      Use this to order your final sources.</para></listitem>
+             <literal>.byte</literal> <emphasis>arg</emphasis> [
-      <listitem><para><literal>.org</literal> <emphasis>address</emphasis>: Sets the program
+             , <emphasis>arg</emphasis>, ... ]: Specifies a series of
-      counter to the address specified. <emphasis>This does not emit any
+             arguments, which are evaluated, and strings, which are
-      code in and of itself, nor does it overwrite anything that
+             included as raw ASCII data. The final results of these
-      previously existed.</emphasis> If you wish to jump ahead in memory,
+             arguments must be one byte in size. Separate constants
-      use <literal>.advance</literal>.</para></listitem>
+             are separated by commas.
-      <listitem><para><literal>.require</literal> <emphasis>filename</emphasis>: Includes the entirety
+           </para>
-      of the file specified at that point in the program.  Unlike <literal>.include</literal>,
+         </listitem>
-      however, code included with <literal>.require</literal> will only be inserted once.
+         <listitem>
-      The <literal>.require</literal> directive is useful for ensuring that certain code libraries
+           <para>
-      are somewhere in the final binary.  They are also very useful for guaranteeing that
+             <literal>.checkpc</literal> <emphasis>address</emphasis>:
-      macro libraries are available.</para></listitem>
+             Ensures that the program counter is less than or equal to
-      <listitem><para><literal>.space</literal> <emphasis>label</emphasis> <emphasis>size</emphasis>: This
+             the address specified, and emits an assemble-time error
-      directive is used to organize global variables. It defines the
+             if it is not. <emphasis>This produces no code in the
-      label specified to be at the current location of the program
+             final binary - it is there to ensure that linking a large
-      counter, and then advances the program counter <emphasis>size</emphasis>
+             amount of data together does not overstep memory
-      steps ahead. No actual code is produced. This is equivalent
+             boundaries.</emphasis>
-      to <literal>label: .org ^+size</literal>.</para></listitem>
+           </para>
-      <listitem><para><literal>.text</literal> <emphasis>[label]</emphasis>: Sets the segment to
+         </listitem>
-      the segment name specified and allows output. If no label is
+         <listitem>
-      given, switches to the default text segment.</para></listitem>
+           <para>
-      <listitem><para><literal>.word</literal> <emphasis>arg</emphasis> [ , <emphasis>arg</emphasis>, ... ]:
+             <literal>.data</literal> <emphasis>[label]</emphasis>:
-      Like <literal>.byte</literal>, but values are all treated as two-byte
+             Sets the segment to the segment name specified and
-      values and stored low-end first (as is the 6502's wont). Use
+             disallows output. If no label is given, switches to the
-      this to create jump tables (an unadorned label will evaluate
+             default data segment.
-      to that label's location) or otherwise store 16-bit
+           </para>
-      data.</para></listitem>
+         </listitem>
-      <listitem><para><literal>.dword</literal> <emphasis>arg</emphasis> [ , <emphasis>arg</emphasis>, ...]:
+         <listitem>
-      Like <literal>.word</literal>, but for 32-bit values.</para></listitem>
+           <para>
-      <listitem><para><literal>.wordbe</literal> <emphasis>arg</emphasis> [ , <emphasis>arg</emphasis>, ...]:
+             <literal>.incbin</literal> <emphasis>filename</emphasis>:
-      Like <literal>.word</literal>, but stores the value in a big-endian format (high byte first).</para></listitem>
+             Inserts the contents of the file specified as binary
-      <listitem><para><literal>.dwordbe</literal> <emphasis>arg</emphasis> [ , <emphasis>arg</emphasis>, ...]:
+             data. Use it to include graphics information, precompiled
-      Like <literal>.dword</literal>, but stores the value high byte first.</para></listitem>
+             code, or other non-assembler data.
-      <listitem><para><literal>.scope</literal>: Starts a new scope block. Labels
+           </para>
-      that begin with an underscore are only reachable from within
+         </listitem>
-      their innermost enclosing <literal>.scope</literal> statement.</para></listitem>
+         <listitem>
-      <listitem><para><literal>.scend</literal>: Ends a scope block. Makes the
+           <para>
-      temporary labels defined since the last <literal>.scope</literal>
+             <literal>.include</literal> <emphasis>filename</emphasis>:
-      statement unreachable, and permits them to be redefined in a
+             Includes the entirety of the file specified at that point
-      new scope.</para></listitem>
+             in the program.  Use this to order your final sources, if
-      <listitem><para><literal>.macro</literal> <emphasis>name</emphasis>: Begins a macro
+             you aren't doing it via the command line.
-      definition block. This is a scope block that can be inlined
+           </para>
-      at arbitrary points with <literal>.invoke</literal>. Arguments to the
+         </listitem>
-      macro will be bound to temporary labels with names like
+         <listitem>
-      <literal>_1</literal>, <literal>_2</literal>, etc.</para></listitem>
+           <para>
-      <listitem><para><literal>.macend</literal>: Ends a macro definition
+             <literal>.org</literal> <emphasis>address</emphasis>:
-      block.</para></listitem>
+             Sets the program counter to the address
-      <listitem><para><literal>.invoke</literal> <emphasis>label</emphasis> [<emphasis>argument</emphasis> [,
+             specified. <emphasis>This does not emit any code in and
-      <emphasis>argument</emphasis> ...]]: invokes (inlines) the specified
+             of itself, nor does it overwrite anything that previously
-      macro, binding the values of the arguments to the ones the
+             existed.</emphasis> If you wish to jump ahead in memory,
-      macro definition intends to read.  A shorthand for <literal>.invoke</literal>
+             use <literal>.advance</literal>.
-      is the name of the macro to invoke, backquoted.</para></listitem>
+           </para>
-         </itemizedlist>
+         </listitem>
         <listitem>
           <para>
             <literal>.require</literal> <emphasis>filename</emphasis>:
             Includes the entirety of the file specified at that point
             in the program.  Unlike <literal>.include</literal>,
             however, code included with <literal>.require</literal>
             will only be inserted once.
             The <literal>.require</literal> directive is useful for
             ensuring that certain code libraries are somewhere in the
             final binary.  It is also very useful for guaranteeing
             that macro libraries are available.
           </para>
         </listitem>
         <listitem>
           <para>
             <literal>.space</literal> <emphasis>label</emphasis> <emphasis>size</emphasis>:
             This directive is used to organize global variables. It
             defines the label specified to be at the current location
             of the program counter, and then advances the program
             counter <emphasis>size</emphasis> steps ahead. No actual
             code is produced. This is equivalent to <literal>label:
             .org ^+size</literal>.
           </para>
         </listitem>
         <listitem>
           <para>
             <literal>.text</literal> <emphasis>[label]</emphasis>:
             Sets the segment to the segment name specified and allows
             output. If no label is given, switches to the default
             text segment.
           </para>
         </listitem>
         <listitem>
           <para>
             <literal>.word</literal> <emphasis>arg</emphasis> [
             , <emphasis>arg</emphasis>, ... ]:
             Like <literal>.byte</literal>, but values are all treated
             as two-byte values and stored low-end first (as is the
             6502's wont). Use this to create jump tables (an
             unadorned label will evaluate to that label's location)
             or otherwise store 16-bit data.
           </para>
         </listitem>
         <listitem>
           <para>
             <literal>.dword</literal> <emphasis>arg</emphasis> [
             , <emphasis>arg</emphasis>, ...]:
             Like <literal>.word</literal>, but for 32-bit
             values.
           </para>
         </listitem>
         <listitem>
           <para>
             <literal>.wordbe</literal> <emphasis>arg</emphasis> [
             , <emphasis>arg</emphasis>, ...]:
             Like <literal>.word</literal>, but stores the value in a
             big-endian format (high byte first).
           </para>
         </listitem>
         <listitem>
           <para>
             <literal>.dwordbe</literal> <emphasis>arg</emphasis> [
             , <emphasis>arg</emphasis>, ...]:
             Like <literal>.dword</literal>, but stores the value high
             byte first.
           </para>
         </listitem>
         <listitem>
           <para>
             <literal>.scope</literal>: Starts a new scope
             block. Labels that begin with an underscore are only
             reachable from within their innermost
             enclosing <literal>.scope</literal>
             statement.
           </para>
         </listitem>
         <listitem>
           <para>
             <literal>.scend</literal>: Ends a scope block. Makes the
             temporary labels defined since the
             last <literal>.scope</literal> statement unreachable, and
             permits them to be redefined in a new
             scope.
           </para>
         </listitem>
         <listitem>
           <para>
             <literal>.macro</literal> <emphasis>name</emphasis>:
             Begins a macro definition block. This is a scope block
             that can be inlined at arbitrary points
             with <literal>.invoke</literal>. Arguments to the macro
             will be bound to temporary labels with names like
             <literal>_1</literal>, <literal>_2</literal>, etc.
           </para>
         </listitem>
      <listitem>
        <para>
          <literal>.macend</literal>: Ends a macro definition block.
        </para>
      </listitem>
      <listitem>
        <para>
          <literal>.invoke</literal> <emphasis>label</emphasis> [<emphasis>argument</emphasis> [,
          <emphasis>argument</emphasis> ...]]: invokes (inlines) the
          specified macro, binding the values of the arguments to the
          ones the macro definition intends to read.  A shorthand
          for <literal>.invoke</literal> is the name of the macro to
          invoke, backquoted.
        </para>
      </listitem>
    </itemizedlist>
  </section>
 </appendix>
--- a/doc/hll1.sgm
+++ b/doc/hll1.sgm
@ -0,0 +1,185 @@
 <chapter id="hll-1">
 <title>The Second Step</title>
 <para>
  This essay discusses how to do 16-or-more bit addition and
  subtraction on the 6502, and how to do unsigned comparisons
  properly, thus making 16-bit arithmetic less necessary.
 </para>
 <section>
  <title>The problem</title>
  <para>
    The <literal>ADC</literal>, <literal>SBC</literal>, <literal>INX</literal>,
    and <literal>INY</literal> instructions are the only real
    arithmetic instructions the 6502 chip has.  In and of themselves,
    they aren't too useful for general applications: the accumulator
    can only hold 8 bits, and thus can't store any value over 255.
    Matters get even worse when we're branching based on
    values; <literal>BMI</literal> and <literal>BPL</literal> hinge on
    the seventh (sign) bit of the result, so we can't represent any
    value above 127.
  </para>
 </section>
 <section>
  <title>The solution</title>
  <para>
    We have two solutions available to us.  First, we can use
    the <quote>unsigned</quote> discipline, which involves checking
    different flags, but lets us deal with values between 0 and 255
    instead of -128 to 127.  Second, we can trade speed and register
    persistence for multiple precision arithmetic, using 16-bit
    integers (-32768 to 32767, or 0-65535), 24-bit, or more.
  </para>
  <para>
    Multiplication, division, and floating point arithmetic are beyond
    the scope of this essay.  The best way to deal with those is to
    find a math library on the web (I
    recommend <ulink url="http://www.6502.org/"></ulink>) and use the
    routines there.
  </para>
 </section>
 <section>
  <title>Unsigned arithmetic</title>
  <para>
    When writing control code that hinges on numbers, we should always
    strive to have our comparison be with zero; that way, no explicit
    compare is necessary, and we can branch simply
    with <literal>BEQ/BNE</literal>, which test the zero flag.
    Otherwise, we use <literal>CMP</literal>.
    The <literal>CMP</literal> command subtracts its argument from the
    accumulator (without borrow), updates the flags, but throws away
    the result.  If the value is equal, the result is zero.
    (<literal>CMP</literal> followed by <literal>BEQ</literal>
    branches if the argument is equal to the accumulator; this is
    probably why it's called <literal>BEQ</literal> and not something
    like <literal>BZS</literal>.)
  </para>
  <para>
    Intuitively, then, to check if the accumulator is <emphasis>less
    than</emphasis> some value, we <literal>CMP</literal> against that
    value and <literal>BMI</literal>.  The <literal>BMI</literal>
    command branches based on the Negative Flag, which is equal to the
    seventh bit of <literal>CMP</literal>'s subtract.  That's exactly
    what we need, for signed arithmetic.  However, this produces
    problems if you're writing a boundary detector on your screen or
    something and find that 192 &lt; 4.  192 is outside of a signed
    byte's range, and is interpreted as if it were -64.  This will not
    do for most graphics applications, where your values will be
    ranging from 0-319 or 0-199 or 0-255.
  </para>
  <para>
    Instead, we take advantage of the implied subtraction
    that <literal>CMP</literal> does.  When subtracting, the result's
    carry bit starts at 1, and gets borrowed from if necessary.  Let
    us consider some four-bit subtractions.
  </para>
 <programlisting>
 C|3210       C|3210
 ------       ------
 1|1001    9  1|1001    9
  |0100  - 4   |1100  -12
 ------  ---  ------  ---
 1|0101    5  0|1101   -3
 </programlisting>
  <para>
    The <literal>CMP</literal> command properly modifies the carry bit
    to reflect this.  When computing A-B, the carry bit is set if A
    &gt;= B, and it's clear if A &lt; B.  Consider the following two
    code sequences.
  </para>
 <programlisting>
    (1)                  (2)
  CMP #$C0            CMP #$C0
  BMI label           BCC label
 </programlisting>
  <para>
    The code in the first column treats the value in the accumulator
    as a signed value, and branches if the value is less than -64.
    (Because of overflow issues, it will actually branch for
    accumulator values between $40 and $BF, even though it <emphasis>should</emphasis>
    only be doing it for values between $80 and $BF.  To see why,
    compare $40 to $C0 and look at the result.)  The second column
    code treats the accumulator as holding an unsigned value, and
    branches if the value is less than 192.  It will branch for
    accumulator values $00-$BF.
  </para>
 </section>
 <section>
  <title>16-bit addition and subtraction</title>
  <para>
    Time to use the carry bit for what it was meant to do.  Adding two
    8 bit numbers can produce a 9-bit result.  That 9th bit is stored
    in the carry flag.  The <literal>ADC</literal> command adds the
    carry value to its result, as well.  Thus, carries work just as
    we'd expect them to.  Suppose we're storing two 16-bit values, low
    byte first, in $C100-1 and $C102-3.  To add them together and
    store them in $C104-5, this is very easy:
  </para>
 <programlisting>
  CLC
  LDA $C100
  ADC $C102
  STA $C104
  LDA $C101
  ADC $C103
  STA $C105
 </programlisting>
  <para>
    Subtraction is identical, but you set the carry bit first
    with <literal>SEC</literal> (because borrow is the complement of
    carry&mdash;think about how the unsigned compare works if this
    puzzles you) and, of course, using the <literal>SBC</literal>
    instruction instead of <literal>ADC</literal>.
  </para>
  <para>
    The carry/borrow bit is set appropriately to let you continue,
    too.  As long as you just keep working your way up to bytes of
    ever-higher significance, this generalizes to 24 (do it three
    times instead of two) or 32 (four, etc.) bit integers.
  </para>
 </section>
 <section>
  <title>16-bit comparisons</title>
  <para>
    Doing comparisons on extended precision values is about the same
    as doing them on 8-bit values, but you have to have the value you
    test in memory, since it won't fit in the accumulator all at once.
    You don't have to store the values back anywhere, either, since
    all you care about is the final state of the flags.  For example,
    here's a signed comparison, branching to <literal>label</literal>
    if the value in $C100-1 is less than 1000 ($03E8):
  </para>
 <programlisting>
  SEC
  LDA $C100
  SBC #$E8
  LDA $C101    ; We only need the carry bit from that subtract
  SBC #$03
  BMI label
 </programlisting>
  <para>
    All the commentary on signed and unsigned compares holds for
    16-bit (or higher) integers just as it does for the 8-bit
    ones.
  </para>
 </section>
 </chapter>
--- a/doc/hll2.sgm
+++ b/doc/hll2.sgm
@ -0,0 +1,880 @@
 <chapter id="hll2">
 <title>Structured Programming</title>
 <para>
  This essay discusses the machine language equivalents of the
  basic <quote>structured programming</quote> concepts that are part
  of the <quote>imperative</quote> family of programming languages:
  if/then/else, for/next, while loops, and procedures.  It also
  discusses basic use of variables, as well as arrays, multi-byte data
  types (records), and sub-byte data types (bitfields).  It closes by
  hand-compiling pseudo-code for an insertion sort on linked lists
  into assembler.  A complete Commodore 64 application is included as
  a sample with this essay.
 </para>
 <section>
  <title>Control constructs</title>
  <section>
    <title>Branches: <literal>if x then y else z</literal></title>
    <para>
      This is almost the most basic control construct.
      The <emphasis>most</emphasis> basic is <literal>if x then
      y</literal>, which is a simple branch instruction
      (bcc/bcs/beq/bmi/bne/bpl/bvc/bvs) past the <quote>then</quote>
      clause if the conditional is false:
    </para>
 <programlisting>
   iny
   bne no'overflow
   inx
 no'overflow:
   ;; rest of code
 </programlisting>
    <para>
      This increments the value of the y register, and if it just
      wrapped back around to zero, it increments the x register too.
      It is basically equivalent to the C statement <literal>if
      ((++y)==0) ++x;</literal>.  We need a few more labels to handle
      else clauses as well.
    </para>
 <programlisting>
   ;; Computation of the conditional expression.
   ;; We assume for the sake of the example that
   ;; we want to execute the THEN clause if the
   ;; zero bit is set, otherwise the ELSE
   ;; clause.  This will happen after a CMP,
   ;; which is the most common kind of 'if'
   ;; statement anyway.
   BNE else'clause
   ;; THEN clause code goes here.
   JMP end'of'if'stmt
 else'clause:
   ;; ELSE clause code goes here.
 end'of'if'stmt:
   ;; ... rest of code.
 </programlisting>
  </section>
  <section>
    <title>Free loops: <literal>while x do y</literal></title>
    <para>
      A <emphasis>free loop</emphasis> is one that might execute any
      number of times.  These are basically just a combination
      of <literal>if</literal> and <literal>goto</literal>.  For
      a <quote>while x do y</quote> loop, that executes zero or more
      times, you'd have code like this...
    </para>
 <programlisting>
 loop'begin:
   ;; ... computation of condition, setting zero
   ;;     bit if loop is finished...
   beq loop'done
   ;; ... loop body goes here
   jmp loop'begin
 loop'done:
   ;; ... rest of program.
 </programlisting>
    <para>
      If you want to ensure that the loop body executes at least once
      (do y while x), just move the test to the end.
    </para>
 <programlisting>
 loop'begin:
   ;; ... loop body goes here
   ;; ... computation of condition, setting zero
   ;;     bit if loop is finished...
   bne loop'begin
   ;; ... rest of program.
 </programlisting>
    <para>
      The choice of zero bit is kind of arbitrary here.  If the
      condition involves the carry bit, or overflow, or negative, then
      replace the beq with bcs/bvs/bmi appropriately.
    </para>
  </section>
  <section>
    <title>Bounded loops: <literal>for i = x to y do z</literal></title>
    <para>
      A special case of loops is one where you know exactly how many
      times you're going through it&mdash;this is called
      a <emphasis>bounded</emphasis> loop.  Suppose you're copying 16
      bytes from $C000 to $D000.  The C code for that would look
      something like this:
    </para>
 <programlisting>
   char *a = (char *)0xC000;
   char *b = (char *)0xD000;
   int i;
   for (i = 0; i &lt; 16; i++) { b[i] = a[i]; }
 </programlisting>
    <para>
      C doesn't directly support bounded loops;
      its <literal>for</literal> statement is just <quote>syntactic
      sugar</quote> for a while statement.  However, we can take
      advantage of special purpose machine instructions to get very
      straightforward code:
    </para>
 <programlisting>
   ldx #$00
 loop:
   lda $c000, x
   sta $d000, x
   inx
   cpx #$10
   bmi loop
 </programlisting>
    <para>
      However, remember that every arithmetic operation,
      including <literal>inx</literal> and <literal>dex</literal>,
      sets the various flags, including the Zero bit.  That means that
      if we can make our computation <emphasis>end</emphasis> when the
      counter hits zero, we can shave off some bytes:
    </para>
 <programlisting>
   ldx #$10
 loop:
   lda $bfff, x
   sta $cfff, x
   dex
   bne loop
 </programlisting>
    <para>
      Notice that we had to change the addresses we're indexing from,
      because x takes a slightly different range of values.  The space
      savings is small here, and it's become slightly more unclear.
      (It also hasn't actually saved any time, because the lda and sta
      instructions are crossing a page boundary where they weren't
      before&mdash;but if the start or end arrays began at $b020 or
      something this wouldn't be an issue.)  This tends to work better
      when the precise value of the counter isn't used in the
      computation&mdash;so let us consider the NES, which uses memory
      location $2007 as a port to its video memory.  Suppose we wish
      to jam 4,096 copies of the hex value $20 into the video memory.
      We can write this <emphasis>very</emphasis> cleanly, using the X
      and Y registers as indices in a nested loop.
    </para>
 <programlisting>
   ldx #$10
   ldy #$00
   lda #$20
 loop:
   sta $2007
   iny
   bne loop
   dex
   bne loop
 </programlisting>
    <para>
      Work through this code.  Convince yourself that
      the <literal>sta</literal> is executed exactly 16*256 = 4096
      times.
    </para>
    <para>
      This is an example of a <emphasis>nested</emphasis> loop: a loop
      inside a loop.  Since our internal loop didn't need the X or Y
      registers, we got to use both of them, which is nice, because
      they have special incrementing and decrementing instructions.
      The accumulator lacks these instructions, so it is a poor choice
      to use for index variables.  If you have a bounded loop and
      don't have access to registers, use memory locations
      instead:
    </para>
 <programlisting>
   lda #$10
   sta counter  ; loop 16 times
 loop:
   ;; Do stuff that trashes all the registers
   dec counter
   bne loop
 </programlisting>
    <para>
      That's it!  These are the basic control constructs for using
      inside of procedures.  Before talking about how to organize
      procedures, I'll briefly cover the way the 6502 handles its
      stack, because stacks and procedures are very tightly
      intertwined.
    </para>
  </section>
 </section>
 <section>
  <title>The stack</title>
  <para>
    The 6502 has an onboard stack in page 1.  You can modify the stack
    pointer by storing values in the X register and
    using <literal>txs</literal>; an <quote>empty</quote> stack is
    value $FF.  Going into a procedure pushes the address of the next
    instruction onto the stack, and RTS pops that value off and jumps
    there.  (Well, not precisely.  JSR actually pushes a value that's
    one byte short, and RTS loads the value, increases it by
    one, and THEN jumps there.  But that's only an issue if you're
    using RTS to implement jump tables.)  On an interrupt, the next
    instruction's address is pushed on the stack, then the processor
    flags, and it jumps to the handler.  The return from interrupt
    restores the flags and the PC, just as if nothing had
    happened.
  </para>
  <para>
    The stack only has 256 possible entries; since addresses take two
    bytes to store, that means that if you call something that calls
    something that calls something that (etc., etc., 129 times), your
    computation will fail.  This can happen faster if you save
    registers or memory values on the stack (see below).
  </para>
 </section>
 <section>
  <title>Procedures and register saving</title>
  <para>
    All programming languages are designed around the concept of
    procedures.<footnote><para>Yes, all of them. Functional languages
    just let you do more things with them, logic programming has
    implicit calls to query procedures, and
    object-oriented <quote>methods</quote> are just normal procedures
    that take one extra argument in secret.</para></footnote>
    Procedures let you break a computation up into different parts,
    then use them independently.  However, compilers do a lot of work
    for you behind the scenes to let you think this.  Consider the
    following assembler code.  How many times does the loop
    execute?
  </para>
 <programlisting>
   ldx #$10
 loop:
   jsr do'stuff
   dex
   bne loop
 </programlisting>
  <para>
    The correct answer is <quote>I don't know, but
    it <emphasis>should</emphasis> be 16.</quote> The reason we don't
    know is because we're assuming here that
    the <literal>do'stuff</literal> routine doesn't change the value
    of the X register.  If it does, then all sorts of chaos could
    result.  For major routines that aren't called often but are
    called in places where the register state is important, you should
    store the old registers on the stack with code like this:
  </para>
 <programlisting>
 do'stuff:
   pha
   txa
   pha
   tya
   pha
   ;; Rest of do'stuff goes here
   pla
   tay
   pla
   tax
   pla
   rts
 </programlisting>
  <para>
    (Remember, the last item pushed onto the stack is the first one
    pulled off, so you have to restore them in reverse order.)  That's
    three more bytes on the stack, so you don't want to do this if you
    don't absolutely have to.  If <literal>do'stuff</literal>
    actually <emphasis>doesn't</emphasis> touch X, there's no need to
    save and restore the value.  This technique is
    called <emphasis>callee-save</emphasis>.
  </para>
  <para>
    The reverse technique is called <emphasis>caller-save</emphasis>
    and pushes important registers onto the stack before the routine
    is called, then restores them afterwards.  Each technique has its
    advantages and disadvantages.  The best way to handle it in your
    own code is to mark at the top of each routine which registers
    need to be saved by the caller.  (It's also useful to note things
    like how it takes arguments and how it returns values.)
  </para>
 </section>
 <section>
  <title>Variables</title>
  <para>
    Variables come in several flavors.
  </para>
  <section>
    <title>Global variables</title>
    <para>
      Global variables are variables that can be reached from any
      point in the program.  Since the 6502 has no memory protection,
      these are easy to declare.  Take some random chunk of unused
      memory and declare it to be the global variables area.  All
      reasonable assemblers have commands that let you give a symbolic
      name to a memory location&mdash;you can use this to give your
      globals names.
    </para>
  </section>
  <section>
    <title>Local variables</title>
    <para>
      All modern languages have some concept of <quote>local
      variables</quote>, which are data values unique to that
      invocation of that procedure.  In modern architectures, this data
      is stored into and read directly off of the stack.  The 6502
      doesn't really let you do this cleanly; I'll discuss ways of
      handling it in a later essay.  If you're implementing a system
      from scratch, you can design your memory model to not require
      such extreme measures.  There are three basic techniques.
    </para>
    <section>
      <title>Treat local variables like registers</title>
      <para>
        This means that any memory location you use, you save on the
        stack and restore afterwards.  This
        can <emphasis>really</emphasis> eat up stack space, and it's
        really slow, it's often pointless, and it has a tendency to
        overflow the stack.  I can't recommend it.  But it does let
        you do recursion right, if you don't need to save much memory
        and you aren't recursing very deep.
      </para>
    </section>
    <section>
      <title>Procedure-based memory allocation</title>
      <para>
        With this technique, you give each procedure its own little
        chunk of memory for use with its data.  All the variables are
        still, technically, globals; a
        routine <emphasis>could</emphasis> interfere with another's,
        but the discipline of <quote>only mess with real globals, and
        your own locals</quote> is very, very easy to maintain.
      </para>
      <para>
        This has many advantages.  It's <emphasis>very</emphasis>
        fast, both to write and to run, because loading a variable is
        an Absolute or Zero Page instruction.  Also, any procedure may
        call any other procedure, as long as it doesn't wind up
        calling itself at some point.
      </para>
      <para>
        It has two major disadvantages.  First, if many routines need
        a lot of space, it can consume more memory than it should.
        Also, this technique can require significant assembler
        support&mdash;you must ensure that no procedure's local
        variables are defined in the same place as any other
        procedure, and it essentially requires a full symbolic linker
        to do right.  Ophis includes commands for <emphasis>memory
        segmentation simulation</emphasis> that automate most of this
        task, and make writing general libraries feasible.
      </para>
    </section>
    <section>
      <title>Partition-based memory allocation</title>
      <para>
        It's not <emphasis>really</emphasis> necessary that no
        procedure overwrite memory used by any other procedure.  It's
        only required that procedures don't write on the memory that
        their <emphasis>callers</emphasis> use.  Suppose that your
        program is organized into a bunch of procedures, and each falls
        into one of three sets:
      </para>
      <itemizedlist>
        <listitem><para>Procedures in set A don't call anyone.</para></listitem>
        <listitem><para>Procedures in set B only call procedures in set A.</para></listitem>
        <listitem><para>Procedures in set C only call procedures in sets A or B.</para></listitem>
      </itemizedlist>
      <para>
        Now, each <emphasis>set</emphasis> can be given its own chunk
        of memory, and we can be absolutely sure that no procedures
        overwrite each other.  Even if every procedure in set C uses
        the <emphasis>same</emphasis> memory location, they'll never
        step on each other, because there's no way to get to any other
        routine in set C <emphasis>from</emphasis> any routine in set
        C.
      </para>
      <para>
        This has the same time efficiencies as procedure-based memory
        allocation, and, given a thoughtful design aimed at using this
        technique, also can use significantly less memory at run time.
        It also requires much less assembler support, as addresses
        for variables may be assigned by hand without having to worry
        about those addresses already being used.  However, it does
        impose a very tight discipline on the design of the overall
        system, so you'll have to do a lot more work before you start
        actually writing code.
      </para>
    </section>
  </section>
  <section>
    <title>Constants</title>
    <para>
      Constants are <quote>variables</quote> that don't change.  If
      you know that the value you're using is not going to change, you
      should fold it into the code, either as an Immediate operand
      wherever it's used, or (if it's more complicated than that)
      as <literal>.byte</literal> commands in between the procedures.
      This is especially important for ROM-based systems such as the
      NES; the NES has very little RAM available, so constants should
      be kept in the more plentiful ROM wherever possible.
    </para>
  </section>
 </section>
 <section>
  <title>Data structures</title>
  <para>
    So far, we've been treating data as a bunch of one-byte values.
    There really isn't a lot you can do just with bytes.  This section
    talks about how to deal with larger and smaller elements.
  </para>
  <section>
    <title>Arrays</title>
    <para>
      An <emphasis>array</emphasis> is a bunch of data elements in a
      row.  An array of bytes is very easy to handle with the 6502
      chip, because the various indexed addressing modes handle it for
      you.  Just load the index into the X or Y register and do an
      absolute indexed load.  In general, these are going to be
      zero-indexed (that is, a 32-byte array is indexed from 0 to 31.)
      This code would initialize a byte array with 32 entries to
      0:
    </para>
 <programlisting>
   lda #$00
   tax
 loop:
   sta array,x
   inx
   cpx #$20
   bne loop
 </programlisting>
    <para>
      (If you count down to save instructions, remember to adjust the
      base address so that it's still writing the same memory
      location.)
    </para>
    <para>
      This approach to arrays has some limits.  Primary among them is
      that we can't have arrays of size larger than 256; we can't fit
      our index into the index register.  In order to address larger
      arrays, we need to use the indirect indexed addressing mode.  We
      use 16-bit addition to add the offset to the base pointer, then
      set the Y register to 0 and then load the value
      with <literal>lda (ptr),y</literal>.
    </para>
    <para>
      Well, actually, we can do better than that.  Suppose we want to
      clear out 8K of ram, from $2000 to $4000.  We can use the Y
      register to hold the low byte of our offset, and only update the
      high byte when necessary.  That produces the following
      loop:
    </para>
 <programlisting>
   lda #$00  ; Set pointer value to base ($2000)
   sta ptr
   lda #$20
   sta ptr+1
   lda #$00  ; Storing a zero
   ldx #$20  ; 8,192 ($2000) iterations: high byte
   ldy #$00  ; low byte.
 loop:
   sta (ptr),y
   iny
   bne loop  ; If we haven't wrapped around, go back
   inc ptr+1 ; Otherwise update high byte
   dex       ; bump counter
   bne loop  ; and continue if we aren't done
 </programlisting>
    <para>
      This code could be optimized further; the loop prelude in
      particular loads a lot of redundant values that could be
      compressed down further:
    </para>
 <programlisting>
   lda #$00
   tay
   ldx #$20
   sta ptr
   stx ptr+1
 </programlisting>
    <para>
      That's not directly relevant to arrays, but these sorts of
      things are good things to keep in mind when writing your code.
      Done well, they can make it much smaller and faster; done
      carelessly, they can force a lot of bizarre dependencies on your
      code and make it impossible to modify later.
    </para>
  </section>
  <section>
    <title>Records</title>
    <para>
      A <emphasis>record</emphasis> is a collection of values all
      referred to as one variable.  This has no immediate
      representation in assembler.  If you have a global variable
      that's two bytes and a code pointer, this is exactly equivalent
      to three separate variables.  You can just put one label in
      front of it, and refer to the first byte
      as <literal>label</literal>, the second
      as <literal>label+1</literal>, and the code pointer
      as <literal>label+2</literal>.
    </para>
    <para>
      This really applies to all data structures that take up more
      than one byte.  When dealing with the pointer, a 16-bit value,
      we refer to the low byte as <literal>ptr</literal>
      (or <literal>label+2</literal>, in the example above), and the
      high byte as <literal>ptr+1</literal>
      (or <literal>label+3</literal>).
    </para>
    <para>
      Arrays of records are more interesting.  There are two
      possibilities for these.  The way most high level languages
      treat it is by keeping the records contiguous.  If you have an
      array of pairs of sixteen-bit integers, then the records are stored
      in order, one at a time.  The first is in location $1000, the
      next in $1004, the next in $1008, and so on.  You can do this
      with the 6502, but you'll probably have to use the indirect
      indexed mode if you want to be able to iterate
      conveniently.
    </para>
    <para>
      Another, more unusual, but more efficient approach is to keep
      each byte as a separate array, just like in the arrays example
      above.  To illustrate, here's a little bit of code to go through
      a contiguous array of 16 bit integers, adding their values to
      some <literal>total</literal> variable:
    </para>
 <programlisting>
   ldx #$10  ; Number of elements in the array
   ldy #$00  ; Byte index from array start
 loop:
   clc
   lda array, y      ; Low byte
   adc total
   sta total
   lda array+1, y    ; High byte
   adc total+1
   sta total+1
   iny               ; Jump ahead to next entry
   iny
   dex               ; Check for loop termination
   bne loop
 </programlisting>
    <para>
      And here's the same loop, keeping the high and low bytes in
      separate arrays:
    </para>
 <programlisting>
   ldx #$00
 loop:
   clc
   lda lowbyte,x
   adc total
   sta total
   lda highbyte,x
   adc total+1
   sta total+1
   inx
   cpx #$10
   bne loop
 </programlisting>
    <para>
      Which approach is the right one depends on what you're doing.
      For large arrays, the first approach is better, as you only need
      to maintain one base pointer.  For smaller arrays, the easier
      indexing makes the second approach more convenient.
    </para>
  </section>
  <section>
    <title>Bitfields</title>
    <para>
      To store values that are smaller than a byte, you can save space
      by putting multiple values in a byte.  To extract a sub-byte
      value, use the bitmasking commands:
    </para>
    <itemizedlist>
      <listitem><para>To set bits, use the <literal>ORA</literal> command.  <literal>ORA #$0F</literal> sets the lower four bits to 1 and leaves the rest unchanged.</para></listitem>
      <listitem><para>To clear bits, use the <literal>AND</literal> command.  <literal>AND #$F0</literal> sets the lower four bits to 0 and leaves the rest unchanged.</para></listitem>
      <listitem><para>To reverse bits, use the <literal>EOR</literal> command.  <literal>EOR #$0F</literal> reverses the lower four bits and leaves the rest unchanged.</para></listitem>
      <listitem><para>To test if a bit is 0, AND away everything but that bit, then see if the Zero bit was set.  If the bit is in the top two bits of a memory location, you can use the BIT command instead (which stores bit 7 in the Negative bit, and bit 6 in the Overflow bit).</para></listitem>
    </itemizedlist>
  </section>
 </section>
 <section>
  <title>A modest example: Insertion sort on linked lists</title>
  <para>
    To demonstrate these techniques, we will now produce code to
    perform insertion sort on a linked list.  We'll start by defining
    our data structure, then defining the routines we want to write,
    then producing actual code for those routines.  A downloadable
    version that will run unmodified on a Commodore 64 closes the
    chapter.
  </para>
  <section>
    <title>The data structure</title>
    <para>
      We don't really want to have to deal with pointers if we can
      possibly avoid it, but it's hard to do a linked list without
      them.  Instead of pointers, we will
      use <emphasis>cursors</emphasis>: small integers that represent
      the index into the array of values.  This lets us use the
      many-small-byte-arrays technique for our data.  Furthermore, our
      random data that we're sorting never has to move, so we may
      declare it as a constant and only bother with changing the
      values of <literal>head</literal> and
      the <literal>next</literal> arrays.  The data record definition
      looks like this:
    </para>
 <programlisting>
  head : byte;
  data : const int[16] = [838, 618, 205, 984, 724, 301, 249, 946,
                          925,  43, 114, 697, 985, 633, 312,  86];
  next : byte[16];
 </programlisting>
    <para>
      Exactly how this gets represented will vary from assembler to
      assembler.  Ophis does it like this:
    </para>
 <programlisting>
 .data
 .space head 1
 .space next 16
 .text
 lb:   .byte &lt;$838,&lt;$618,&lt;$205,&lt;$984,&lt;$724,&lt;$301,&lt;$249,&lt;$946
      .byte &lt;$925,&lt;$043,&lt;$114,&lt;$697,&lt;$985,&lt;$633,&lt;$312,&lt;$086
 hb:   .byte >$838,>$618,>$205,>$984,>$724,>$301,>$249,>$946
      .byte >$925,>$043,>$114,>$697,>$985,>$633,>$312,>$086
 </programlisting>
  </section>
  <section>
    <title>Doing an insertion sort</title>
    <para>
      To do an insertion sort, we clear the list by setting the 'head'
      value to -1, and then insert each element into the list one at a
      time, placing each element in its proper order in the list.  We
      can consider the lb/hb structure alone as an array of 16
      integers, and just insert each one into the list one at a
      time.
    </para>
 <programlisting>
 procedure insertion_sort
  head := -1;
  for i := 0 to 15 do
    insert_elt i
  end
 end
 </programlisting>
    <para>
      This translates pretty directly.  We'll have insert_elt take its
      argument in the X register, and loop with that.  However, given
      that insert_elt is going to be a complex procedure, we'll save
      the value first.  The assembler code becomes:
    </para>
 <programlisting>
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; insertion'sort:  Sorts the list defined by head, next, hb, lb.
 ; Arguments:  None.
 ; Modifies:   All registers destroyed, head and next array sorted.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 insertion'sort:
        lda #$FF        ; Clear list by storing the terminator in 'head'
        sta head
        ldx #$0         ; Loop through the lb/hb array, adding each
 insertion'sort'loop:    ; element one at a time
        txa
        pha
        jsr insert_elt
        pla
        tax
        inx
        cpx #$10
        bne insertion'sort'loop
        rts
 </programlisting>
  </section>
  <section>
    <title>Inserting an element</title>
    <para>
      The pseudocode for inserting an element is a bit more
      complicated.  If the list is empty, or the value we're inserting
      goes at the front, then we have to update the value
      of <literal>head</literal>.  Otherwise, we can iterate through
      the list until we find the element that our value fits in after
      (so, the first element whose successor is larger than our
      value).  Then we update the next pointers directly and exit.
    </para>
 <programlisting>
 procedure insert_elt i
 begin
   if head = -1 then begin
      head := i;
      next[i] := -1;
      return;
   end;
   val := data[i];
   if val &lt; data[head] then begin
      next[i] := head;
      head := i;
      return;
   end;
   current := head;
   while (next[current] &lt;&gt; -1 and val &gt; data[next[current]]) do
      current := next[current];
   end;
   next[i] := next[current];
   next[current] := i;
 end;
 </programlisting>
    <para>
      This produces the following rather hefty chunk of code:
    </para>
 <programlisting>
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; insert_elt: Insert an element into the linked list.  Maintains the
 ;             list in sorted, ascending order.  Used by
 ;             insertion'sort.
 ; Arguments:  X register holds the index of the element to add.
 ; Modifies:   All registers destroyed; head and next arrays updated
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .data
 .space lbtoinsert 1
 .space hbtoinsert 1
 .space indextoinsert 1
 .text
 insert_elt:
        ldy head                        ; If the list is empty, make
        cpy #$FF                        ; head point at it, and return.
        bne insert_elt'list'not'empty
        stx head
        tya
        sta next,x
        rts
 insert_elt'list'not'empty:
        lda lb,x                        ; Cache the data we're inserting
        sta lbtoinsert
        lda hb,x
        sta hbtoinsert
        stx indextoinsert
        ldy head                        ; Compare the first value with
        sec                             ; the data.  If the data must
        lda lb,y                        ; be inserted at the front...
        sbc lbtoinsert
        lda hb,y
        sbc hbtoinsert
        bmi insert_elt'not'smallest
        tya                             ; Set its next pointer to the
        sta next,x                      ; old head, update the head
        stx head                        ; pointer, and return.
        rts
 insert_elt'not'smallest:
        ldx head
 insert_elt'loop:                        ; At this point, we know that
        lda next,x                      ; argument > data[X].
        tay
        cpy #$FF                        ; if next[X] = #$FF, insert arg at end.
        beq insert_elt'insert'after'current
        lda lb,y                        ; Otherwise, compare arg to
        sec                             ; data[next[X]].  If we insert
        sbc lbtoinsert                  ; before that...
        lda hb,y
        sbc hbtoinsert
        bmi insert_elt'goto'next
 insert_elt'insert'after'current:        ; Fix up all the next links
        tya
        ldy indextoinsert
        sta next,y
        tya
        sta next,x
        rts                             ; and return.
 insert_elt'goto'next:                   ; Otherwise, let X = next[X]
        tya                             ; and go looping again.
        tax
        jmp insert_elt'loop
 </programlisting>
  </section>
  <section>
    <title>The complete application</title>
    <para>
      The full application, which deals with interfacing with CBM
      BASIC and handles console I/O and such, is
      in <xref linkend="structure-src" endterm="structure-fname">.
    </para>
  </section>
 </section>
 </chapter>
--- a/doc/hll3.sgm
+++ b/doc/hll3.sgm
@ -0,0 +1,297 @@
 <chapter id="hll3">
 <title>Pointers and Indirection</title>
 <para>
  The basics of pointers versus cursors (or, at the 6502 assembler
  level, the indirect indexed addressing mode versus the absolute
  indexed ones) were covered in <xref linkend="hll2">.  This essay seeks
  to explain the uses of the indirect modes, and how to implement
  pointer operations with them.  It does <emphasis>not</emphasis> seek to explain
  why you'd want to use pointers for something to begin with; for a
  tutorial on proper pointer usage, consult any decent C textbook.
 </para>
 <section>
  <title>The absolute basics</title>
  <para>
    A pointer is a variable holding the address of a memory location.
    Memory locations take 16 bits to represent on the 6502: thus, we
    need two bytes to hold it.  Any decent assembler will have ways of
    taking the high and low bytes of an address; use these to acquire
    the raw values you need.  The 6502 chip does not have any
    simple <quote>pure</quote> indirect modes (except
    for <literal>JMP</literal>, which is a matter for a later essay);
    all are indexed, and they're indexed different ways depending on
    which index register you use.
  </para>
  <section>
    <title>The simplest example</title>
    <para>
      When doing a simple, direct dereference (that is, something
      equivalent to the C code <literal>c=*b;</literal>) the code
      looks like this:
    </para>
 <programlisting>
        ldy #0
        lda (b), y
        sta c
 </programlisting>
    <para>
      Even with this simple example, there are several important
      things to notice.
    </para>
    <itemizedlist>
      <listitem>
        <para>
          The variable <literal>b</literal> <emphasis>must be on the
            zero page</emphasis>, and furthermore, it <emphasis>cannot
            be $FF.</emphasis> All your pointer values need to be
            either stored on the zero page to begin with or copied
            there before use.
        </para>
      </listitem>
      <listitem>
        <para>
          The <literal>y</literal> in the <literal>lda</literal>
          statement must be y.  It cannot be x (that's a different
          form of indirection), and it cannot be a constant.  If
          you're doing a lot of indirection, be sure to keep your Y
          register free to handle the indexing on the
          pointers.
      </para>
      </listitem>
      <listitem>
        <para>
          The <literal>b</literal> variable is used alone.  Statements
          like <literal>lda (b+2), y</literal> are syntactically valid
          and sometimes even correct: it dereferences the value next
          to <literal>b</literal> after adding y to the value therein.
          However, it is almost guaranteed that what you <emphasis>really</emphasis>
          wanted to do was compute <literal>*(b+2)</literal> (that is,
          take the address held in b, add 2 to <emphasis>that</emphasis>,
          and dereference that value); see the next section for how to
          do this properly.
        </para>
      </listitem>
    </itemizedlist>
    <para>
      In nearly all cases, it is the Y-register's version (Indirect
      Indexed) that you want to use when you're dealing with pointers.
      Even though either version could be used for this example, we
      use the Y register to establish this habit.
    </para>
  </section>
 </section>
 <section>
  <title>Pointer arithmetic</title>
  <para>
    Pointer arithmetic is an obscenely powerful and dangerous
    technique.  However, it's the most straightforward way to deal
    with enormous arrays, structs, indexable stacks, and nearly
    everything you do in C.  (C has no native array or string types
    primarily because it allows arbitrary pointer arithmetic, which is
    strong enough to handle all of those without complaint and at
    blazing speed.  It also allows for all kinds of buffer overrun
    security holes, but let's face it, who's going to be cracking root
    on your Apple II?)  There are a number of ways to implement this
    on the 6502.  We'll deal with them in increasing order of design
    complexity.
  </para>
  <section>
    <title>The straightforward, slow way</title>
    <para>
      When computing a pointer value, you simply treat the pointer as
      if it were a 16-bit integer.  Do all the math you need, then
      when the time comes to dereference it, simply do a direct
      dereference as above.  This is definitely doable, and it's not
      difficult.  However, it is costly in both space and time.
    </para>
    <para>
      When dealing with arbitrary indices large enough that they won't
      fit in the Y register, or when creating values that you don't
      intend to dereference (such as subtracting two pointers to find
      the length of a string), this is also the only truly usable
      technique.
    </para>
  </section>
  <section>
    <title>The clever fast way</title>
    <para>
      But wait, you say.  Often when we compute a value, at least one
      of the operations is going to be an addition, and we're almost
      certain to have that value be less than 256!  Surely we may save
      ourselves an operation by loading that value into the Y register
      and having the load operation itself perform the final
      addition!
    </para>
    <para>
      Very good.  This is the fastest technique, and sometimes it's
      even the most readable.  These cases usually involve repeated
      reading of various fields from a structure or record.  The base
      pointer always points to the base of the structure (or the top
      of the local variable list, or what have you) and the Y register
      takes values that index into that structure.  This lets you keep
      the pointer variable in memory largely static and requires no
      explicit arithmetic instructions at all.
    </para>
    <para>
      However, this technique is highly opaque and should always be
      well documented, indicating exactly what you think you're
      pointing at.  Then, when you get garbage results, you can
      compare your comments and the resulting Y values with the actual
      definition of the structure to see who's screwing up.
    </para>
    <para>
      For a case where we still need to do arithmetic, consider the
      classic case of needing to clear out a large chunk of memory.
      The following code fills the 1KB of memory between $C000 and
      $C400 with zeroes:
    </para>
 <programlisting>
        lda #$C0        ; Store #$C000 in mem (low byte first)
        sta mem+1
        lda #$00
        sta mem
        ldx #$04        ; x holds number of times to execute outer loop
        tay             ; accumulator and y are both 0
 loop:   sta (mem), y
        iny
        bne loop        ; Inner loop ends when y wraps around to 0
        inc mem+1       ; "Carry" from the iny to the core pointer
        dex             ; Decrement outer loop count, quit if done
        bne loop
 </programlisting>
    <para>
      Used carefully, proper use of the Y register can make your code
      smaller, faster, <emphasis>and</emphasis> more readable.  Used
      carelessly it can make your code an unreadable, unmaintainable
      mess.  Use it wisely, and with care, and it will be your
      greatest ally in writing flexible code.
    </para>
  </section>
 </section>
 <section>
  <title>What about Indexed Indirect?</title>
  <para>
    This essay has concerned itself almost exclusively with the
    Indirect Indexed&mdash;or (Indirect), Y&mdash;mode.  What about Indexed
    Indirect&mdash;(Indirect, X)?  This is a <emphasis>much</emphasis>
    less useful mode than the Y register's version.  While the Y
    register indirection lets you implement pointers and arrays in
    full generality, the X register is useful for pretty much only one
    application: lookup tables for single byte values.
  </para>
  <para>
    Even coming up with a motivating example for this is difficult,
    but here goes.  Suppose you have multiple, widely disparate
    sections of memory that you're watching for signals.  The
    following routine takes a resource index in the accumulator and
    returns the status byte for the corresponding resource.
  </para>
 <programlisting>
 ; This data is sitting on the zero page somewhere
 resource_status_table: .word resource0_status, resource1_status,
                       .word resource2_status, resource3_status,
                       ; etc. etc. etc.
 ; This is the actual program code
 .text
 getstatus:
        clc   ; Multiply argument by 2 before putting it in X, so that it
        asl   ; produces a value that's properly word-indexed
        tax
        lda (resource_status_table, x)
        rts
 </programlisting>
  <para>
    Why having a routine such as this is better than just having the
    calling routine access resourceN_status itself as an absolute
    memory load is left as an exercise for the reader.  That aside,
    this code fragment does serve as a reminder that when indexing an
    array of anything other than bytes, you must multiply your index
    by the size of the objects you want to index.  C does this
    automatically&mdash;assembler does not.  Stay sharp.
  </para>
 </section>
 <section>
  <title>Comparison with the other indexed forms</title>
  <para>
    Pointers are slow.  It sounds odd saying this, when C is the
    fastest language around on modern machines precisely because of
    its powerful and extensive use of pointers.  However, modern
    architectures are designed to be optimized for C-style code (as an
    example, the x86 architecture allows statements like <literal>mov
    eax, [bs+bx+4*di]</literal> as a single instruction), while the
    6502 is not.  An (Indirect), Y operation can take up to 6 cycles
    to complete just on its own, while the preparation of that command
    costs additional time <emphasis>and</emphasis> scribbles over a
    bunch of registers, meaning memory operations to save the values
    and yet more time spent.  The simple code given at the beginning
    of this essay&mdash;loading <literal>*b</literal> into the
    accumulator&mdash;takes 7 cycles, not counting the 6 it takes to
    load b with the appropriate value to begin with.  If b is known to
    contain a specific value, we can write a single Absolute mode
    instruction to load its value, which takes only 4 cycles and also
    preserves the value in the Y register.  Clearly, Absolute mode
    should be used whenever possible.
  </para>
  <para>
    One might be tempted to use self-modifying code to solve this
    problem.  This actually doesn't pay off nearly enough for the hassle
    it generates; for self-modifying code, the address must be
    generated, then stored in the instruction, and then the data must
    be loaded.  Cost: 16 cycles for 2 immediate loads, 2 absolute
    stores, and 1 absolute load.  For the straight pointer
    dereference, we generate the address, store it in the pointer,
    clear the index, then dereference that.  Cost: 17 cycles for 3
    immediate loads, 2 zero page stores, and 1 indirect indexed load.
    Furthermore, unlike in the self-modifying case, loops where simple
    arithmetic is being continuously performed only require repeating
    the final load instruction, which allows for much greater time
    savings over an equivalent self-modifying loop.
  </para>
  <para>
    (This point is also completely moot for NES programmers or anyone
    else whose programs are sitting in ROM, because programs stored on
    a ROM cannot modify themselves.)
  </para>
 </section>
 <section>
  <title>Conclusion</title>
  <para>
    That's pretty much it for pointers.  Though they tend to make
    programs hairy, and learning how to properly deal with pointers is
    what separates real C programmers from the novices, the basic
    mechanics of them are not complex.  With pointers you can do
    efficient passing of large structures, pass-by-reference,
    complicated return values, and dynamic memory management&mdash;and
    now these wondrous toys may be added to your assembler programs,
    too (assuming you have that kind of space to play with).
  </para>
 </section>
 </chapter>
--- a/doc/hll4.sgm
+++ b/doc/hll4.sgm
@ -0,0 +1,270 @@
 <chapter>
 <title>Functionals</title>
 <para>
  This essay deals with indirect calls.  These are the core of an
  enormous number of high level languages: LISP's closures, C's
  function pointers, C++ and Java's virtual method calls, and some
  implementations of the <literal>switch</literal> statement.
 </para>
 <para>
  These techniques vary in complexity, and most will not be
  appropriate for large-scale assembler projects.  Of them, however,
  the Data-Directed approach is the most likely to lead to organized
  and maintainable code.
 </para>
 <section>
  <title>Function Pointers</title>
  <para>
    Because assembly language is totally untyped, function pointers
    are the same as any other sixteen-bit integer.  This makes
    representing them really quite easy; most assemblers should permit
    routines to be declared simply by naming the routine as
    a <literal>.word</literal> directly.
  </para>
  <para>
    To actually invoke these methods, copy them to some sixteen-bit
    location (say, <literal>target</literal>); invoking the
    method is then a simple matter of using an indirect jump:
    the <literal>JMP&nbsp;(target)</literal> instruction.
  </para>
  <para>
    There's really only one subtlety here, and it's that the indirect
    jump is an indirect <emphasis>jump</emphasis>, not an
    indirect <emphasis>function call</emphasis>.  Thus, if some
    function <literal>A</literal> makes an indirect jump to some
    routine, when that routine returns, it returns to whoever
    called <literal>A</literal>, not <literal>A</literal>
    itself.
  </para>
  <para>
    There are several ways of dealing with this, but only one correct
    way, which is to structure your procedures so that any call
    to <literal>JMP&nbsp;(xxxx)</literal> occurs at the very
    end.
  </para>
 </section>
 <section>
  <title>A quick digression on how subroutines work</title>
  <para>
    Ordinarily, subroutines are called with <literal>JSR</literal> and
    finished with <literal>RTS</literal>.  The <literal>JSR</literal>
    instruction takes its own address, adds 2 to it, and pushes this
    16-bit value on the stack, high byte first, then low byte (so that
    the low byte will be popped off first).
  </para>
  <para>
    But wait, you may object.  All <literal>JSR</literal> instructions
    are three bytes long.  This <quote>return address</quote> is in
    the middle of the instruction.  And you would be quite right;
    the <literal>RTS</literal> instruction pops off the 16-bit
    address, adds one to it, and <emphasis>then</emphasis> sets the
    program counter to that value.
  </para>
  <para>
    So it <emphasis>is</emphasis> possible to set up
    a <quote><literal>JSR</literal> indirect</quote> kind of operation
    by adding two to the indirect jump's address and then pushing that
    value onto the stack before making the jump; however, you wouldn't
    want to do this.  It takes six bytes and trashes your accumulator,
    and you can get the same functionality with half the space and
    with no register corruption by simply defining the indirect jump
    to be a one-instruction routine and <literal>JSR</literal>-ing to
    it directly.  As an added bonus, that way if you have multiple
    indirect jumps through the same pointer, you don't need to
    duplicate the jump instruction.
  </para>
  <para>
    Does this mean that abusing <literal>JSR</literal>
    and <literal>RTS</literal> is a dead-end, though?  Not at all...
  </para>
 </section>
 <section>
  <title>Dispatch-on-type and Data-Directed Assembler</title>
  <para>
    Most of the time, you care about function pointers because you've
    arranged them in some kind of table.  You hand it an index
    representing the type of your argument, or which method it is
    you're calling, or some other determinator, and then you index
    into an array of routines and execute the right one.
  </para>
  <para>
    Writing a generic routine to do this is kind of a pain.  First you
    have to pass a 16-bit pointer in, then you have to dereference it
    to figure out where your table is, then you have to do an indexed
    dereference on <emphasis>that</emphasis> to get the routine you
    want to run, then you need to copy it out to somewhere fixed so
    that you can write your jump instruction.  And making this
    non-generic doesn't help a whole lot, since that only saves you
    the first two steps, but now you have to write them out in every
    single indexed jump instruction.  If only there were some way to
    easily and quickly pass in a local pointer directly...
  </para>
  <para>
    Something, say, like the <literal>JSR</literal> instruction, only not for
    program code.
  </para>
  <para>
    Or we could just use the <literal>JSR</literal> statement itself,
    but only call this routine at the ends of other routines, much
    like we were organizing for indirect jumps to begin with.  This
    lets us set up routines that look like this:
  </para>
 <programlisting>
 ; Jump table entry point: the JSR leaves a return address on the stack
 ; that locates the .word table below, so do'jump'table can index into it.
 jump'table'alpha:
    jsr do'jump'table
    .word alpha'0, alpha'1, alpha'2     ; one routine address per index value
 </programlisting>
  <para>
    Where the <literal>alpha'x</literal> routines are the ones to be
    called when the index has that value.  This leaves the
    implementation of <literal>do'jump'table</literal>, which in this case uses the Y
    register to hold the index:
  </para>
 <programlisting>
 ; Dispatch through the .word table that follows the JSR that called us.
 ; In:   Y = index of the table entry to invoke.
 ;       A is saved and restored; X is never touched, so both reach the target.
 ; Uses: _scratch, _jmpptr and _target as working storage; Y is overwritten.
 ; Never returns to the JSR site: the return address is pulled off the stack,
 ; so the target routine returns to whoever called the jump-table routine.
 do'jump'table:
    sta _scratch        ; stash A so it can be handed to the target intact
    pla                 ; pull the return address, low byte first
    sta _jmpptr
    pla
    sta _jmpptr+1       ; _jmpptr = address of the last byte of the JSR
    tya
    asl
    tay                 ; Y = index * 2, since each entry is two bytes
    iny                 ; +1 because _jmpptr is one byte before the table
    lda (_jmpptr), y
    sta _target         ; low byte of the selected routine
    iny
    lda (_jmpptr), y
    sta _target+1       ; high byte of the selected routine
    lda _scratch        ; restore the A we were entered with
    jmp (_target)
 </programlisting>
  <para>
    The <literal>TYA:ASL:TAY:INY</literal> sequence can actually be
    omitted if you don't mind having your Y indices be 1, 3, 5, 7, 9,
    etc., instead of 0, 1, 2, 3, 4, etc.  Likewise, the instructions
    dealing with <literal>_scratch</literal> can be omitted if you
    don't mind trashing the accumulator.  Keeping the accumulator and
    X register pristine for the target call comes in handy, though,
    because it means we can pass in a pointer argument purely in
    registers.  This will come in handy soon...
  </para>
 </section>
 <section>
  <title>VTables and Object-Oriented Assembler</title>
  <para>
    The usual technique for getting something that looks
    object-oriented in non-object-oriented languages is to fill a
    structure with function pointers, and have those functions take
    the structure itself as an argument.  This works just fine in
    assembler, of course (and doesn't really require anything more
    than your traditional jump-indirects), but it's also possible to
    use a lot of the standard optimizations that languages such as C++
    provide.
  </para>
  <para>
    The most important of these is the <emphasis>vtable</emphasis>.
    Each object type has its own vtable, and it's a list of function
    pointers for all the methods that type provides.  This is a space
    savings over the traditional structs-with-function-pointers
    approach because when you have many objects of the same class, you
    only have to represent the vtable once.  So that all objects may
    be treated identically, the vtable location is traditionally fixed
    as being the first entry in the corresponding structure.
  </para>
  <para>
    Virtual method invocation takes an object pointer (traditionally
    called <literal>self</literal> or <literal>this</literal>) and a
    method index and invokes the appropriate method on that object.
    Gee, where have we seen that before?
  </para>
 <programlisting>
 ; Method table for the sprite type, dispatched by index via do'jump'table.
 sprite'vtable:
    jsr do'jump'table
    .word sprite'init, sprite'update, sprite'render     ; table entries 0, 1, 2
 </programlisting>
  <para>
    We mentioned before that vtables are generally the first entries
    in objects.  We can play another nasty trick here, paying an
    additional byte per object to have the vtable be not merely a
    pointer to its vtable routine, but an actual jump instruction to
    it.  (That is, if an object is at location X, then location X is
    the byte value <literal>$4C</literal>,
    representing <literal>JMP</literal>, location X+1 is the low byte
    of the vtable, and location X+2 is the high byte of the vtable.)
    Given that, our <literal>invokevirtual</literal> function becomes
    very simple indeed:
  </para>
 <programlisting>
 ; Invoke a method on an object.
 ; In:  A = low byte, X = high byte of the object pointer.
 ;      Y is not touched here, so it reaches the code that is jumped to.
 ; The pointer is stored in the shared location "this", then control jumps
 ; to the address it holds, i.e. to the first byte of the object itself.
 invokevirtual:
    sta this
    stx this+1
    jmp (this)          ; continue at the object's first byte
 </programlisting>
  <para>
    Which, combined with all our previous work here, takes
    the <literal>this</literal> pointer in <literal>.AX</literal> and
    a method identifier in <literal>.Y</literal> and invokes that
    method on that object.  Arguments besides <literal>this</literal>
    need to be set up before the call
    to <literal>invokevirtual</literal>, probably in some global
    argument array somewhere as discussed back in <xref linkend="hll2">.
  </para>
 </section>
 <section>
  <title>A final reminder</title>
  <para>
    We've been talking about all these routines as if they could be
    copy-pasted or hand-compiled from C++ or Java code.  This isn't
    really the case, primarily because <quote>local variables</quote>
    in your average assembler routines aren't really local, so
    multiple calls to the same method will tend to trash the program
    state.  And since a lot of the machinery described here shares a
    lot of memory (in particular, every single method invocation
    everywhere shares a <literal>this</literal>), attempting to shift
    over standard OO code into this format is likely to fail
    miserably.
  </para>
  <para>
    You can get an awful lot of flexibility out of even just one layer
    of method-calls, though, given a thoughtful
    design. The <literal>do'jump'table</literal> routine, or one very
    like it, was extremely common in NES games in the mid-1980s and
    later, usually as the beginning of the frame-update loop.
  </para>
  <para>
    If you find you really need multiple layers of method calls,
    though, then you really are going to need a full-on program stack,
    and that's going to be several kinds of mess.  That's the topic
    for the final chapter.
  </para>
 </section>
 </chapter>
--- a/doc/hll5.sgm
+++ b/doc/hll5.sgm
@ -0,0 +1,218 @@
 <chapter>
 <title>Call Stacks</title>
 <para>
  All our previous work has been assuming FORTRAN-style calling
  conventions.  In this, all procedure-local variables are actually
  secretly globals.  This means that a function that calls itself will
  end up stomping on its previous values, and everything will be
  hideously scrambled.  Various workarounds for this are covered
  in <xref linkend="hll2">.  Here, we solve the problem fully.
 </para>
 <section>
  <title>Recursion</title>
  <para>
    A procedure in C or other similar languages declares a chunk of
    storage that's unique to that invocation.  This chunk is just
    large enough to hold the return address and all the local
    variables, and is called the <emphasis>stack frame</emphasis>.
    Stack frames are arranged on a <emphasis>call stack</emphasis>;
    when a function is called, the stack grows with the new frame, and
    when that function returns, its frame is destroyed.  Once the main
    function returns, the stack is empty.
  </para>
  <para>
    Most modern architectures are designed to let you implement
    variable access like this directly, without touching the registers
    at all.  The x86 architecture even dedicates a register to
    function explicitly as the <emphasis>stack pointer</emphasis>, and
    then one could read, say, the fifth 16-bit variable into the
    register AX with the command <literal>MOV AX, [SP+10]</literal>.
  </para>
  <para>
    As we saw in <xref linkend="hll3">, the 6502 isn't nearly as
    convenient.  We'd need to keep the stack pointer somewhere on the
    zero page, then load the Y register with 10, then load the
    accumulator with an indirect-indexed load.  This is verbose, keeps
    trashing our registers, and it's very, very slow.
  </para>
  <para>
    So, in the spirit of programmers everywhere, we'll cheat.
  </para>
 </section>
 <section>
  <title>Our Goals</title>
  <para>
    The system we develop should have all of the following
    characteristics.
  </para>
  <itemizedlist>
    <listitem><para>It should be <emphasis>intuitive to program for</emphasis>.  The procedure bodies should be easily readable and writable by humans, even in assembler form.</para></listitem>
    <listitem><para>It should be <emphasis>efficient</emphasis>.  Variable accesses are very common, so procedures shouldn't cost much to run.</para></listitem>
    <listitem><para>It should allow <emphasis>multiple arity</emphasis> in both arguments and return values.  We won't require that an unlimited amount of information be passable, but it should allow more than the three bytes the registers give us.</para></listitem>
    <listitem><para>It should permit <emphasis>tail call elimination</emphasis>, an optimization that will allow certain forms of recursion to actually not grow the stack.</para></listitem>
  </itemizedlist>
  <para>
    Here is a system that meets all these properties.
  </para>
  <itemizedlist>
    <listitem><para>Reserve two bytes of the zero page for a stack pointer.  At the beginning of the program, set it to the top of memory.</para></listitem>
    <listitem><para>Divide the remainder of Zero Page into two parts:
        <itemizedlist>
          <listitem><para>The <emphasis>scratch space</emphasis>, which is where arguments and return values go, and which may be scrambled by any function call, and</para></listitem>
          <listitem><para>The <emphasis>local area</emphasis>, which all functions must restore to their initial state once finished.</para></listitem>
        </itemizedlist>
    </para></listitem>
    <listitem><para>Assign to each procedure a <emphasis>frame size</emphasis> S, which is a maximum size on the amount of the local area the procedure can use.  The procedure's variables will sit in the first S bytes of the local area.</para></listitem>
    <listitem><para>Upon entering the procedure, push the first S bytes of the local area onto the stack; upon exit, pop those S bytes back on top of the local area.</para></listitem>
    <listitem><para>While the procedure is running, only touch the local area and the scratch space.</para></listitem>
  </itemizedlist>
  <para>This meets our design criteria neatly:</para>
  <itemizedlist>
    <listitem><para>It's as intuitive as such a system will get.  You have to call <literal>init'stack</literal> at the beginning, and you need to ensure that <literal>save'stack</literal> and <literal>restore'stack</literal> are called right.  The procedure's program text can pretend that it's just referring to its own variables, just like with the old style.  If a procedure doesn't call <emphasis>anyone</emphasis>, then it can just do all its work in the scratch space.</para></listitem>
    <listitem><para>It's efficient; the inside of the procedure is likely to be faster and smaller than its FORTRAN-style counterpart, because all variable references are on the Zero Page.</para></listitem>
    <listitem><para>Both arguments and return values can be as large as the scratch space.  It's not infinite, but it's probably good enough.</para></listitem>
    <listitem><para>Tail call elimination is possible; just restore the stack before making the JMP to the tail call target.</para></listitem>
  </itemizedlist>
  <para>
    The necessary support code is pretty straightforward.  The stack
    modification routines take the size of the frame in the
    accumulator.  While saving the local area,
    <literal>save'stack</literal> also copies the corresponding values
    from the scratch space into it.  (This is because most functions
    will be wanting to keep their arguments around across calls.)
  </para>
 <programlisting>
 .scope
 ; Stack routines
 ; A software call stack used to save and restore the start of the
 ; fun'vars area around procedure calls.
 .data zp
 .space _sp      $02     ; 16-bit software stack pointer
 .space _counter $01     ; bytes left to copy in the loops below
 .space fun'args $10     ; scratch space: arguments and return values
 .space fun'vars $40     ; local area: per-invocation variables
 .text
 ; init'stack: set the software stack pointer to $A000.  Clobbers A.
 init'stack:
        lda     #$00
        sta     _sp
        lda     #$A0
        sta     _sp+1
        rts
 ; save'stack: open a frame of A bytes.
 ; In:  A = frame size in bytes.
 ; Lowers _sp by A, copies the first A bytes of fun'vars to the new frame,
 ; then loads those same fun'vars bytes from the matching fun'args bytes.
 ; Clobbers A and Y.
 ; NOTE: a size of 0 is not handled; _counter would wrap and the loop
 ; would run 256 times.
 save'stack:
        sta     _counter
        sec                     ; 16-bit subtract: _sp = _sp - size
        lda     _sp
        sbc     _counter
        sta     _sp
        lda     _sp+1
        sbc     #$00            ; propagate the borrow into the high byte
        sta     _sp+1
        ldy     #$00
 *       lda     fun'vars, y     ; save the old local byte in the frame
        sta     (_sp), y
        lda     fun'args, y     ; and replace it with the argument byte
        sta     fun'vars, y
        iny
        dec     _counter
        bne -
        rts
 ; restore'stack: close a frame of A bytes.
 ; In:  A = frame size in bytes (the same size given to save'stack).
 ; Copies A bytes from the frame at _sp back into fun'vars, then raises
 ; _sp by A.  Clobbers A and Y.
 ; NOTE: as with save'stack, a size of 0 is not handled.
 restore'stack:
        pha                     ; keep the size for the pointer adjustment
        sta     _counter
        ldy     #$00
 *       lda     (_sp), y
        sta     fun'vars, y
        iny
        dec     _counter
        bne -
        pla                     ; A = frame size again
        clc                     ; 16-bit add: _sp = _sp + size
        adc     _sp
        sta     _sp
        lda     _sp+1
        adc     #$00            ; propagate the carry into the high byte
        sta     _sp+1
        rts
 .scend
 </programlisting>
 </section>
 <section>
  <title>Example: Fibonacci Numbers</title>
  <para>
    About the simplest <quote>interesting</quote> recursive function
    is the Fibonacci numbers.  The function fib(x) is defined as being
    1 if x is 0 or 1, and being fib(x-2)+fib(x-1) otherwise.
  </para>
  <para>
    Actually expressing it like that directly produces a very
    inefficient implementation, but it's a simple demonstration of the
    system.  Here's code for expressing the fib function:
  </para>
 <programlisting>
 .scope
 ; Uint16 fib (Uint8 x): compute Xth Fibonacci number.
 ; fib(0) = fib(1) = 1.
 ; Stack usage: 3.
 ; In:  fun'args   = x.
 ; Out: fun'args   = low byte of the result,
 ;      fun'args+1 = high byte of the result.
 ; Locals (after save'stack): fun'vars = x,
 ;      fun'vars+1 and fun'vars+2 = saved 16-bit result of fib(x-1).
 fib:    lda     #$03
        jsr     save'stack      ; open a 3-byte frame; fun'vars now holds x
        lda     fun'vars
        cmp     #$02
        bcc     _base           ; x is 0 or 1: base case
        dec     fun'args        ; argument becomes x-1
        jsr     fib
        lda     fun'args        ; keep fib(x-1) in locals across the next call
        sta     fun'vars+1
        lda     fun'args+1
        sta     fun'vars+2
        lda     fun'vars
        sec
        sbc     #$02
        sta     fun'args        ; argument becomes x-2
        jsr     fib
        clc                     ; 16-bit add: result = fib(x-2) + fib(x-1)
        lda     fun'args
        adc     fun'vars+1
        sta     fun'args
        lda     fun'args+1
        adc     fun'vars+2
        sta     fun'args+1
        jmp     _done
 _base:  ldy     #$01            ; base case: result = $0001
        sty     fun'args
        dey
        sty     fun'args+1
 _done:  lda     #$03
        jsr     restore'stack   ; close the frame opened on entry
        rts
 .scend
 </programlisting>
  <para>
    The full application, which deals with interfacing with CBM BASIC
    and handles console I/O and such, is in <xref linkend="fib-src"
    endterm="fib-fname">.
  </para>
 </section>
 </chapter>
--- a/doc/ophismanual.sgm
+++ b/doc/ophismanual.sgm
@ -9,21 +9,60 @@
  <!ENTITY samplecode SYSTEM "samplecode.sgm">
  <!ENTITY pre1 SYSTEM "preface.sgm">
  <!ENTITY cmdref SYSTEM "cmdref.sgm">
  <!ENTITY hll1 SYSTEM "hll1.sgm">
  <!ENTITY hll2 SYSTEM "hll2.sgm">
  <!ENTITY hll3 SYSTEM "hll3.sgm">
  <!ENTITY hll4 SYSTEM "hll4.sgm">
  <!ENTITY hll5 SYSTEM "hll5.sgm">
 ]>
 <book>
  <bookinfo>
    <title>Programming with Ophis</title>
    <author><firstname>Michael</firstname><surname>Martin</surname></author>
-    <copyright><year>2006-7</year><holder>Michael Martin</holder></copyright>
+    <copyright><year>2006-2012</year><holder>Michael Martin</holder></copyright>
  </bookinfo>
  &pre1;
-  &part1;
+  <part label="I">
-  &part2;
+    <title>Using the Ophis Assembler</title>
-  &part3;
+    <partintro>
-  &part4;
+      <para>
-  &part5;
+        The chapters in Part 1 are a tutorial guiding you through the
-  &part6;
+        features and programming model of the Ophis assembler. It uses
-  &part7;
+        the Commodore 64 as its target platform.
      </para>
      <para>
        This is not a tutorial on 6502 assembly language; those are
        available elsewhere.
      </para>
    </partintro>
    &part1;
    &part2;
    &part3;
    &part4;
    &part5;
    &part6;
    &part7;
  </part>
  <part label="II">
    <title>To HLL and Back</title>
    <partintro>
      <para>
        This is a compilation of an essay series I wrote from
        2002 to 2005 explaining how to apply constructs from
        high-level languages in your assembly language projects.
      </para>
      <para>
        The examples have been updated and modernized for Ophis 2, and
        while the examples all target the Commodore 64, they are more
        generally applicable.
      </para>
    </partintro>
    &hll1;
    &hll2;
    &hll3;
    &hll4;
    &hll5;
  </part>
  &samplecode;
  &cmdref;
 </book>
--- a/doc/preface.sgm
+++ b/doc/preface.sgm
@ -1,6 +1,5 @@
 <preface>
  <title>Preface</title>
  <para>
    The Ophis project started on a lark back in 2001.  My graduate
    studies required me to learn Perl and Python, and I'd been playing
@ -8,42 +7,50 @@
    to learn both languages by writing a simple cross-assembler for
    the 6502 chip the C-64 used in both.
  </para>
  <para>
-    The Perl version was quickly abandoned, but the Python one slowly
+    The Perl one&mdash;uncreatively
-    grew in scope and power over the years, and by 2005 was a very
+    dubbed <quote>Perl65</quote>&mdash;was quickly abandoned, but the
-    powerful, flexible macro assembler that saw more use than I'd
+    Python one saw more work. When it came time to name it, one of the
-    expect.  In 2007 I finally got around to implementing the last few
+    things I had been hoping to do with the assembler was to produce
-    features I really wanted and polishing it up for general release.
+    working Apple II programs.  <quote>Ophis</quote> is Greek
    for <quote>snake</quote>, and a number of traditions also use it
    as the actual <emphasis>name</emphasis> of the serpent in the
    Garden of Eden.  So, Pythons, snakes, and stories involving really
    old Apples all combined to name the
    assembler.<footnote><para>Ironically, cross-platform development
    for the Apple II is extremely difficult, and while Ophis has been
    very successfully used to develop code for the Commodore 64,
    Nintendo Entertainment System, and Atari 2600, it has yet to
    actually be deployed on any of the Apples which inspired its
    name.</para></footnote>
  </para>
  <para>
-    Part of that process has been formatting the various little
+    Ophis slowly grew in scope and power over the years, and by 2005
-    tutorials and references I'd created into a single, unified
+    was a very powerful, flexible macro assembler that saw more use
-    document&mdash;the one you are now reading.
+    than I'd expect.  In 2007 Ophis 1.0 was formally released.
    However, Ophis was written for Python 2.1 and this became more and
    more untenable as time went by. As I started receiving patches
    for parts of Ophis, and as I used it for some projects of my own,
    it became clear that Ophis needed to be modernized and to become
    better able to interoperate with other toolchains. It was this
    process that led to Ophis 2.
  </para>
  <para>
    This is an updated edition of <emphasis>Programming With
    Ophis</emphasis>, including documentation for all new features
    introduced and expanding the examples to include simple
    demonstration programs for platforms besides the Commodore 64. It
    also includes updated versions of the <emphasis>To HLL and
    Back</emphasis> essays I wrote using Ophis and Perl65 as example
    languages.
  </para>
  <section>
    <title>Why <quote>Ophis</quote>?</title>
    <para>
      It's actually a kind of a horrific pun.  See, I was using Python
      at the time, and one of the things I had been hoping to do with
      the assembler was to produce working Apple II
      programs.  <quote>Ophis</quote> is Greek
      for <quote>snake</quote>, and a number of traditions also use it
      as the actual <emphasis>name</emphasis> of the serpent in the
      Garden of Eden.  So, Pythons, snakes, and stories involving
      really old Apples all combined to name the assembler.
    </para>
  </section>
  <section>
    <title>Getting a copy of Ophis</title>
    <para>
-      If you're reading this as part of the Ophis install, you clearly
+      As of this writing, the Ophis assembler is hosted at Github. The
-      already have it.  If not, as of this writing the homepage for
+      latest downloads and documentation will be available
-      the Ophis assembler
+      at <ulink url="http://github.com/michaelcmartin/Ophis"></ulink>. If
      is <ulink url="http://hkn.eecs.berkeley.edu/~mcmartin/ophis/"></ulink>. If
      this is out-of-date, a Web search on <quote>Ophis 6502
      assembler</quote> (without the quotation marks) should yield its
      page.
@ -58,17 +65,30 @@
      somewhere in your path.
    </para>
    <para>
-      Windows users that have Python installed can use the same source
+      For Windows users, a prepackaged system made
-      distributions that the other operating systems
+      with <command>py2exe</command> is also available.  The default
-      use; <command>ophis.bat</command> will arrange the environment
+      Windows installer will use this.  In this case, all you need to
-      variables accordingly and invoke the main script.
+      do is have <command>ophis.exe</command> in your path.
    </para>
  </section>
  <section>
    <title>About the examples</title>
    <para>
      Versions of the examples in this book are available from the Ophis site. Windows users will find them packaged with the distribution; all other users can get them as a separate download or pull them directly from github.
    </para>
    <para>
-      If you are on Windows and do not have Python installed, a
+      The code in this book is available in
-      prepackaged system made with <command>py2exe</command> is also
+      the <literal>examples/</literal> subdirectory, while extra
-      available.  The default Windows installer will use this.  In
+      examples will be in subdirectories of their own with brief
-      this case, all you need to do is
+      descriptions.
-      have <command>ophis.exe</command> in your path.
+    </para>
    <para>
      Most examples will require use of <emphasis>platform
      headers</emphasis>&mdash;standardized header files that set
      useful constants for the target system and, if needed, contain
      small programs to allow the program to be loaded and run. These
      are stored in the <literal>platform/</literal> subdirectory.
    </para>
  </section>
 </preface>
--- a/doc/samplecode.sgm
+++ b/doc/samplecode.sgm
@ -5,10 +5,11 @@
    of this manual.
  </para>
  <section id="tutor1-src">
-    <title id="tutor1-fname"><filename>tutor1.oph</filename></title>
+    <title id="tutor1-fname"><filename>hello1.oph</filename></title>
 <programlisting>
 .word $0801
 .org  $0801
 .outfile "hello.prg"
        .word next, 10          ; Next line and current line number
        .byte $9e," 2064",0     ; SYS 2064
@ -28,10 +29,11 @@ hello:  .byte "HELLO, WORLD!", 0
 </programlisting>
  </section>
  <section id="tutor2-src">
-    <title id="tutor2-fname"><filename>tutor2.oph</filename></title>
+    <title id="tutor2-fname"><filename>hello2.oph</filename></title>
 <programlisting>
 .word $0801
 .org  $0801
 .outfile "hello.prg"
 .scope
        .word _next, 10         ; Next line and current line number
@ -68,85 +70,86 @@ _next:  .word 0                 ; End of program
 .advance 2064
-.require "kernal.oph"
+.require "../platform/c64kernal.oph"
 </programlisting>
  </section>
  <section id="kernal-src">
-    <title id="kernal-fname"><filename>kernal.oph</filename></title>
+    <title id="kernal-fname"><filename>c64kernal.oph</filename></title>
 <programlisting>
 ; KERNAL routine aliases (C64)
-.alias acptr  $ffa5
+.alias  acptr           $ffa5
-.alias chkin  $ffc6
+.alias  chkin           $ffc6
-.alias chkout $ffc9
+.alias  chkout          $ffc9
-.alias chrin  $ffcf
+.alias  chrin           $ffcf
-.alias chrout $ffd2
+.alias  chrout          $ffd2
-.alias ciout  $ffa8
+.alias  ciout           $ffa8
-.alias cint   $ff81
+.alias  cint            $ff81
-.alias clall  $ffe7
+.alias  clall           $ffe7
-.alias close  $ffc3
+.alias  close           $ffc3
-.alias clrchn $ffcc
+.alias  clrchn          $ffcc
-.alias getin  $ffe4
+.alias  getin           $ffe4
-.alias iobase $fff3
+.alias  iobase          $fff3
-.alias ioinit $ff84
+.alias  ioinit          $ff84
-.alias listen $ffb1
+.alias  listen          $ffb1
-.alias load   $ffd5
+.alias  load            $ffd5
-.alias membot $ff9c
+.alias  membot          $ff9c
-.alias memtop $ff99
+.alias  memtop          $ff99
-.alias open   $ffc0
+.alias  open            $ffc0
-.alias plot   $fff0
+.alias  plot            $fff0
-.alias ramtas $ff87
+.alias  ramtas          $ff87
-.alias rdtim  $ffde
+.alias  rdtim           $ffde
-.alias readst $ffb7
+.alias  readst          $ffb7
-.alias restor $ff8a
+.alias  restor          $ff8a
-.alias save   $ffd8
+.alias  save            $ffd8
-.alias scnkey $ff9f
+.alias  scnkey          $ff9f
-.alias screen $ffed
+.alias  screen          $ffed
-.alias second $ff93
+.alias  second          $ff93
-.alias setlfs $ffba
+.alias  setlfs          $ffba
-.alias setmsg $ff90
+.alias  setmsg          $ff90
-.alias setnam $ffbd
+.alias  setnam          $ffbd
-.alias settim $ffdb
+.alias  settim          $ffdb
-.alias settmo $ffa2
+.alias  settmo          $ffa2
-.alias stop   $ffe1
+.alias  stop            $ffe1
-.alias talk   $ffb4
+.alias  talk            $ffb4
-.alias tksa   $ff96
+.alias  tksa            $ff96
-.alias udtim  $ffea
+.alias  udtim           $ffea
-.alias unlsn  $ffae
+.alias  unlsn           $ffae
-.alias untlk  $ffab
+.alias  untlk           $ffab
-.alias vector $ff8d
+.alias  vector          $ff8d
 ; Character codes for the colors.
-.alias color'0 144
+.alias  color'0         144
-.alias color'1 5
+.alias  color'1         5
-.alias color'2 28
+.alias  color'2         28
-.alias color'3 159
+.alias  color'3         159
-.alias color'4 156
+.alias  color'4         156
-.alias color'5 30
+.alias  color'5         30
-.alias color'6 31
+.alias  color'6         31
-.alias color'7 158
+.alias  color'7         158
-.alias color'8 129
+.alias  color'8         129
-.alias color'9 149
+.alias  color'9         149
-.alias color'10 150
+.alias  color'10        150
-.alias color'11 151
+.alias  color'11        151
-.alias color'12 152
+.alias  color'12        152
-.alias color'13 153
+.alias  color'13        153
-.alias color'14 154
+.alias  color'14        154
-.alias color'15 155
+.alias  color'15        155
 ; ...and reverse video
-.alias reverse'on 18
+.alias  reverse'on      18
-.alias reverse'off 146
+.alias  reverse'off     146
 ; ...and character set
-.alias upper'case 142
+.alias  upper'case      142
-.alias lower'case 14
+.alias  lower'case      14
 </programlisting>
  </section>
  <section id="tutor3-src">
-    <title id="tutor3-fname"><filename>tutor3.oph</filename></title>
+    <title id="tutor3-fname"><filename>hello3.oph</filename></title>
 <programlisting>
 .include "c64-1.oph"
 .outfile "hello.prg"
 .macro print
        ldx #0
@ -194,9 +197,10 @@ target10: .byte "UNIVERSE", 0
 </programlisting>
  </section>
  <section id="tutor4a-src">
-    <title id="tutor4a-fname"><filename>tutor4a.oph</filename></title>
+    <title id="tutor4a-fname"><filename>hello4a.oph</filename></title>
 <programlisting>
 .include "c64-1.oph"
 .outfile "hello.prg"
 .macro print
        ldx #0
@ -265,9 +269,10 @@ delay:  tax
 </programlisting>
  </section>
  <section id="tutor4b-src">
-    <title id="tutor4b-fname"><filename>tutor4b.oph</filename></title>
+    <title id="tutor4b-fname"><filename>hello4b.oph</filename></title>
 <programlisting>
 .include "c64-1.oph"
 .outfile "hello.prg"
 .macro print
        ldx #0
@ -338,9 +343,10 @@ delay:  tax
 </programlisting>
  </section>
  <section id="tutor4c-src">
-    <title id="tutor4c-fname"><filename>tutor4c.oph</filename></title>
+    <title id="tutor4c-fname"><filename>hello4c.oph</filename></title>
 <programlisting>
 .include "c64-1.oph"
 .outfile "hello.prg"
 .macro print
        ldx #0
@ -414,9 +420,10 @@ delay:  tax
 </programlisting>
  </section>
  <section id="tutor5-src">
-    <title id="tutor5-fname"><filename>tutor5.oph</filename></title>
+    <title id="tutor5-fname"><filename>hello5.oph</filename></title>
 <programlisting>
 .include "c64-1.oph"
 .outfile "hello.prg"
 .data
 .org $C000
@ -494,9 +501,10 @@ delay:  sta _tmp        ; save argument (rdtim destroys it)
 </programlisting>
  </section>
  <section id="tutor6-src">
-    <title id="tutor6-fname"><filename>tutor6.oph</filename></title>
+    <title id="tutor6-fname"><filename>hello6.oph</filename></title>
 <programlisting>
 .include "c64-1.oph"
 .outfile "hello.prg"
 .data
 .org $C000
@ -601,43 +609,63 @@ _done:  rts
 </programlisting>
  </section>
  <section id="c64-2-src">
-    <title id="c64-2-fname"><filename>c64-2.oph</filename></title>
+    <title id="c64-2-fname"><filename>c64_0.oph</filename></title>
 <programlisting>
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Commodore 64 Basic Runtime File
 ;;
 ;; Include this at the TOP of your C64 program, and it will handle
 ;; hiding away the BASIC ROM and data and restoring it at the end.
 ;;
 ;; You will have a contiguous block of RAM from $0800 to $CF81, and
 ;; Zero Page access from $02 to $7F in the segment "zp".
 .word $0801
 .org  $0801
 ; BASIC program that just calls our machine language code
 .scope
        .word _next, 10         ; Next line and current line number
-        .byte $9e," 2064",0     ; SYS 2064
+        .byte $9e," 2062",0     ; SYS 2062
 _next:  .word 0                 ; End of program
 .scend
-.advance $0810
+.data zp ; Zero Page memory segment.
 .require "kernal.oph"
 .data zp
 .org $0002
 .text
 .scope
-        ; Cache BASIC's zero page at top of available RAM.
+        ; Cache BASIC zero page at top of available RAM
-        ldx #$7E
+        ldx     #$7E
-*       lda $01, x
+*       lda     $01, x
-        sta $CF81, x
+        sta     $CF81, x
        dex
-        bne -
+        bne     -
-        jsr _main
+        ; Swap out the BASIC ROM for RAM
        lda     $01
        and     #$fe
        ora     #$06
        sta     $01
-        ; Restore BASIC's zero page and return control.
+        ; Run the real program
        jsr     _main
-        ldx #$7E
+        ; Restore BASIC ROM
-*       lda $CF81, x
+        lda     $01
-        sta $01, x
+        ora     #$07
        sta     $01
        ; Restore BASIC zero page
        ldx     #$7E
 *       lda     $CF81, x
        sta     $01, x
        dex
-        bne -
+        bne     -
        ; Back to BASIC
        rts
 _main:
@ -646,9 +674,11 @@ _main:
 </programlisting>
  </section>
  <section id="tutor7-src">
-    <title id="tutor7-fname"><filename>tutor7.oph</filename></title>
+    <title id="tutor7-fname"><filename>hello7.oph</filename></title>
 <programlisting>
-.include "c64-2.oph"
+.include "../platform/c64_0.oph"
 .require "../platform/c64kernal.oph"
 .outfile "hello.prg"
 .data
 .org $C000
@ -744,6 +774,461 @@ _done:  rts
 .data zp
 .checkpc $80
 </programlisting>
  </section>
  <section id="structure-src">
    <title id="structure-fname"><filename>structuredemo.oph</filename></title>
 <programlisting>
 .include "../platform/c64_0.oph"
 .require "../platform/c64kernal.oph"
 .outfile "structuredemo.prg"
        ; Entry point: show the table as stored, sort it, show it again.
        jsr     print'unsorted          ; values in array order
        jsr     insertion'sort          ; build the sorted linked list
        jsr     print'list              ; values in list order
        rts
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Linked list data: head, next, lb, hb.
 ; lb/hb: Low/high bytes of the data array.  These are immutable and
 ;        kept with the program text.
 ; head:  Array index of the first element in the list, or #$FF if the
 ;        list is empty
 ; next:  Array of successor indices.  If you've just read element X,
 ;        the value of memory location next+X is the index of the
 ;        next element.  If next is #$FF, you've reached the end of
 ;        the list.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .data
 .org    $C000
 .space  head    1
 .space  next    16
 .text
 lb:   .byte &lt;$838,&lt;$618,&lt;$205,&lt;$984,&lt;$724,&lt;$301,&lt;$249,&lt;$946
      .byte &lt;$925,&lt;$043,&lt;$114,&lt;$697,&lt;$985,&lt;$633,&lt;$312,&lt;$086
 hb:   .byte &gt;$838,&gt;$618,&gt;$205,&gt;$984,&gt;$724,&gt;$301,&gt;$249,&gt;$946
      .byte &gt;$925,&gt;$043,&gt;$114,&gt;$697,&gt;$985,&gt;$633,&gt;$312,&gt;$086
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; insertion'sort:  Rebuilds the linked list (head, next) so that it
 ;                  visits the sixteen lb/hb entries in ascending
 ;                  order.
 ; Arguments:  None.
 ; Modifies:   All registers destroyed; head and next are rewritten.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 insertion'sort:
        lda     #$FF            ; $FF is the end-of-list marker, so
        sta     head            ; storing it in head empties the list.
        ldx     #$00            ; X = index of the element to insert
 insertion'sort'loop:
        txa                     ; insert_elt destroys X, so keep the
        pha                     ; index on the hardware stack.
        jsr     insert_elt
        pla                     ; Recover the index...
        tax
        inx                     ; ...and move on to the next element.
        cpx     #$10            ; All sixteen inserted?
        bne     insertion'sort'loop
        rts
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; insert_elt: Insert an element into the linked list.  Maintains the
 ;             list in sorted, ascending order.  Used by
 ;             insertion'sort.
 ; Arguments:  X register holds the index of the element to add.
 ; Modifies:   All registers destroyed; head and next arrays updated.
 ;             lbtoinsert, hbtoinsert and indextoinsert are scratch.
 ; Note:       The 16-bit comparisons branch on the sign of the
 ;             difference (bmi).  That orders the values correctly
 ;             only while every value is below $8000, which holds
 ;             for the lb/hb table in this file.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .data
 .space lbtoinsert 1
 .space hbtoinsert 1
 .space indextoinsert 1
 .text
 insert_elt:
        ldy head                        ; If the list is empty, make
        cpy #$FF                        ; head point at it, and return.
        bne insert_elt'list'not'empty
        stx head
        tya                             ; Y is still $FF here, so the
        sta next,x                      ; new element ends the list.
        rts
 insert_elt'list'not'empty:
        lda lb,x                        ; Cache the data we're inserting
        sta lbtoinsert
        lda hb,x
        sta hbtoinsert
        stx indextoinsert
        ldy head                        ; Compare the first value with
        sec                             ; the data.  If the data must
        lda lb,y                        ; be inserted at the front...
        sbc lbtoinsert                  ; (low byte: only the borrow
        lda hb,y                        ; into the high byte matters)
        sbc hbtoinsert
        bmi insert_elt'not'smallest     ; Negative: head is smaller.
        tya                             ; Set its next pointer to the
        sta next,x                      ; old head, update the head
        stx head                        ; pointer, and return.
        rts
 insert_elt'not'smallest:
        ldx head
 insert_elt'loop:                        ; At this point, we know that
        lda next,x                      ; argument &gt; data[X].
        tay
        cpy #$FF                        ; if next[X] = #$FF, insert arg at end.
        beq insert_elt'insert'after'current
        lda lb,y                        ; Otherwise, compare arg to
        sec                             ; data[next[X]].  If we insert
        sbc lbtoinsert                  ; before that...
        lda hb,y
        sbc hbtoinsert
        bmi insert_elt'goto'next        ; Negative: keep searching.
 insert_elt'insert'after'current:        ; Fix up all the next links
        tya                             ; A = old next[X]
        ldy indextoinsert
        sta next,y                      ; next[new] = old next[X]
        tya                             ; A = index of the new element
        sta next,x                      ; next[X] = new
        rts                             ; and return.
 insert_elt'goto'next:                   ; Otherwise, let X = next[X]
        tya                             ; and go looping again.
        tax
        jmp insert_elt'loop
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; print'unsorted: Prints the heading, then all sixteen values in the
 ;                 order they are stored in the lb/hb arrays.  Each
 ;                 value is four hex digits followed by a space.
 ; Standalone procedure.  Y is the loop index, so it has to survive
 ; the calls to print'hex and chrout.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 print'unsorted:
        lda     #&lt;unsorted'hdr       ; A/X = address of the heading
        ldx     #&gt;unsorted'hdr
        jsr     put'string
        ldy     #$00                    ; Y = array index
 print'unsorted'loop:
        lda     hb,y                    ; High byte first...
        jsr     print'hex
        lda     lb,y                    ; ...then the low byte.
        jsr     print'hex
        lda     #$20                    ; Space between values
        jsr     chrout
        iny
        cpy     #$10                    ; Sixteen entries in the table
        bne     print'unsorted'loop
        lda     #$0D                    ; Finish with a carriage return
        jsr     chrout
        rts
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; print'list: Prints the heading, then follows the next links from
 ;             head until the $FF terminator, printing each value as
 ;             four hex digits followed by a space.
 ; Standalone procedure.  Y is the current element, so it has to
 ; survive the calls to print'hex and chrout.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 print'list:
        lda     #&lt;sorted'hdr         ; A/X = address of the heading
        ldx     #&gt;sorted'hdr
        jsr     put'string
        ldy     head                    ; Y = first element, or $FF
 print'list'loop:
        cpy     #$FF                    ; End of the list?
        beq     print'list'done
        lda     hb,y                    ; High byte first...
        jsr     print'hex
        lda     lb,y                    ; ...then the low byte.
        jsr     print'hex
        lda     #$20                    ; Space between values
        jsr     chrout
        lda     next,y                  ; Y = next[Y]
        tay
        jmp     print'list'loop
 print'list'done:
        lda     #$0d                    ; Finish with a carriage return
        jsr     chrout
        rts
 ;; String data for the above routines.
 unsorted'hdr:
        .byte 147               ; Clear screen first!
        .byte "UNSORTED DATA:",13,0
 sorted'hdr:
        .byte "SORTED DATA:",13,0
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; print'hex: outputs a two-character hex representation of a one-
 ;            byte value.
 ; Arguments: Byte to print in accumulator
 ; Modifies: .A and .X
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 print'hex:
        pha                     ; Keep the byte for the low digit.
        lsr                     ; Move the high nibble down.  lsr
        lsr                     ; always shifts a zero into bit 7,
        lsr                     ; so the carry does not need to be
        lsr                     ; cleared first.
        tax
        lda hexstr,x            ; High digit
        jsr chrout
        pla
        and #$0F                ; Isolate the low nibble.
        tax
        lda hexstr,x            ; Low digit
        jsr chrout
        rts
 ; Character data array for print'hex.
 hexstr: .byte "0123456789ABCDEF"
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; put'string: sends a null-terminated string to chrout, one byte at
 ;             a time.  The index is a single byte, so output stops
 ;             after 256 bytes even if no terminator was found.
 ; Arguments: Low byte of string address in .A, high byte in .X
 ; Modifies: .A and .Y
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .data zp
 .space put'string'addr 2
 .text
 put'string:
        sta     put'string'addr         ; Pointer for the indirect
        stx     put'string'addr+1       ; indexed load below
        ldy     #$00
 put'string'loop:
        lda     (put'string'addr),y
        beq     put'string'done         ; Terminator reached
        jsr     chrout
        iny
        bne     put'string'loop         ; Falls through when Y wraps
 put'string'done:
        rts
 </programlisting>
  </section>
  <section id="fib-src">
    <title id="fib-fname"><filename>fibonacci.oph</filename></title>
 <programlisting>
 .include "../platform/c64_0.oph"
 .require "../platform/c64kernal.oph"
 .outfile "fibonacci.prg"
        ; Main program: prints the title, then for num = 0..19 prints
        ; "num+1: fib(num)" on a line of its own.
        jsr     init'stack    ; Give the software stack a known start
                              ; before fib's first save'stack.
        lda     #&lt;opening     ; Print opening text
        sta     fun'args
        lda     #&gt;opening
        sta     fun'args+1
        jsr     print'string
        lda     #$00
        sta     fun'vars      ; Count num from 0 to 19
 *       lda     fun'vars      ; Main loop: print num, with leading space if &lt;10
        cmp     #$09
        bcs     +
        lda     #$20
        jsr     chrout
        lda     fun'vars
 *       sta     fun'args      ; Copy num to args, print it, plus ": "
        inc     fun'args      ; (the printed index is num+1)
        lda     #$00
        sta     fun'args+1
        jsr     print'dec
        lda     #$3A
        jsr     chrout
        lda     #$20
        jsr     chrout
        lda     fun'vars      ; Copy num to args, call fib, print result
        sta     fun'args
        jsr     fib
        jsr     print'dec
        lda     #$0D          ; Newline
        jsr     chrout
        inc     fun'vars      ; Increment num; if it's 20, we're done.
        lda     fun'vars
        cmp     #20
        bne     --            ; Otherwise, loop.
        rts
 opening:
 .byte   147, "           FIBONACCI SEQUENCE",13,13,0
 .scope
 ; Uint16 fib (Uint8 x): compute Xth Fibonacci number.
 ; fib(0) = fib(1) = 1.
 ; Arguments:  x in fun'args.
 ; Returns:    16-bit result in fun'args (low) and fun'args+1 (high).
 ; Stack usage: 3.
 ; save'stack preserves the caller's first three fun'vars bytes and
 ; copies the arguments over them, so inside this routine fun'vars
 ; holds x and fun'vars+1/+2 hold the result of fib(x-1).
 fib:    lda     #$03
        jsr     save'stack
        lda     fun'vars    ; If x &lt; 2, goto _base.
        cmp     #$02
        bcc     _base
        dec     fun'args    ; Otherwise, call fib(x-1)...
        jsr     fib
        lda     fun'args    ; Copy the result to local variable...
        sta     fun'vars+1
        lda     fun'args+1
        sta     fun'vars+2
        lda     fun'vars    ; Call fib(x-2)...
        sec
        sbc     #$02
        sta     fun'args
        jsr     fib
        clc                 ; And add the old result to it, leaving it
        lda     fun'args    ; in the 'result' location.
        adc     fun'vars+1
        sta     fun'args
        lda     fun'args+1
        adc     fun'vars+2
        sta     fun'args+1
        jmp     _done       ; and then we're done.
 _base:  ldy     #$01        ; In the base case, just copy 1 to the
        sty     fun'args    ; result.
        dey
        sty     fun'args+1
 _done:  lda     #$03        ; Give the caller its three locals back.
        jsr     restore'stack
        rts
 .scend
 .scope
 ; Software stack used to preserve function locals across calls.
 ;   init'stack:    resets the stack pointer to $A000.
 ;   save'stack:    A = byte count n (must be non-zero).  Moves the
 ;                  stack pointer down by n, saves fun'vars[0..n-1]
 ;                  there, then copies fun'args[0..n-1] into fun'vars.
 ;   restore'stack: A = byte count n (must be non-zero).  Reloads
 ;                  fun'vars[0..n-1] from the stack and moves the
 ;                  stack pointer back up by n.
 .data zp
 .space _sp      $02
 .space _counter $01
 .space fun'args $10
 .space fun'vars $40
 .text
 init'stack:
        lda     #$00            ; _sp = $A000
        sta     _sp
        lda     #$A0
        sta     _sp+1
        rts
 save'stack:
        sta     _counter        ; Bytes still to copy
        sec                     ; _sp = _sp - n (16-bit)
        lda     _sp
        sbc     _counter
        sta     _sp
        lda     _sp+1
        sbc     #$00
        sta     _sp+1
        ldy     #$00
 _push:  lda     fun'vars, y     ; The old local goes to the stack...
        sta     (_sp), y
        lda     fun'args, y     ; ...and the argument takes its place.
        sta     fun'vars, y
        iny
        dec     _counter
        bne     _push
        rts
 restore'stack:
        pha                     ; Keep n for the pointer fix-up below.
        sta     _counter
        ldy     #$00
 _pop:   lda     (_sp), y
        sta     fun'vars, y
        iny
        dec     _counter
        bne     _pop
        pla
        clc                     ; _sp = _sp + n (16-bit)
        adc     _sp
        sta     _sp
        lda     _sp+1
        adc     #$00
        sta     _sp+1
        rts
 .scend
 ; Utility functions.  print'dec prints the unsigned 16-bit integer in
 ; fun'args/fun'args+1 as decimal without leading zeroes.  There is
 ; no division routine: each digit is found by repeatedly subtracting
 ; a power of ten (repsub'16).  The value and the four bytes after it
 ; in fun'args are overwritten.  print'string, further down, prints a
 ; zero-terminated string.
 .scope
 .data
 .org    fun'args                        ; Scratch overlays fun'args
        .space  _val            2       ; Value still to be printed
        .space  _step           2       ; Power of ten being counted
        .space  _res            1       ; Digit found so far
        .space  _allowzero      1       ; Non-zero once a digit is out
 .text
 print'dec:
        lda     #$00            ; Nothing printed yet, so zeroes are
        sta     _allowzero      ; leading zeroes for now.
        lda     #&lt;10000      ; Ten-thousands
        sta     _step
        lda     #&gt;10000
        sta     _step+1
        jsr     repsub'16
        lda     #&lt;1000       ; Thousands
        sta     _step
        lda     #&gt;1000
        sta     _step+1
        jsr     repsub'16
        lda     #0              ; Hundreds
        sta     _step+1
        lda     #100
        sta     _step
        jsr     repsub'16
        lda     #10             ; Tens (high byte is still zero)
        sta     _step
        jsr     repsub'16
        lda     _val            ; The remainder is the units digit,
        jsr     _print          ; which is always printed.
        rts
 ; repsub'16: counts how often _step can be taken from _val, leaves
 ; the remainder in _val, and prints the count as a digit unless it
 ; would be a leading zero.
 repsub'16:
        lda     #$00
        sta     _res
 _try:   lda     _val            ; Trial subtraction: does _step fit?
        sec
        sbc     _step
        lda     _val+1
        sbc     _step+1
        bcc     _done           ; Borrow: it no longer fits.
        lda     _val            ; It fits, so subtract for real.
        sec
        sbc     _step
        sta     _val
        lda     _val+1
        sbc     _step+1
        sta     _val+1
        inc     _res
        jmp     _try
 _done:  lda     _res
        ora     _allowzero      ; Zero digit with nothing printed yet?
        beq     _ret            ; Then suppress it.
        sta     _allowzero      ; From now on zeroes are significant.
        lda     _res
 _print: clc
        adc     #'0             ; Digit to character
        jsr     chrout
 _ret:   rts
 .scend
 ; print'string: prints the zero-terminated string whose address is
 ;               stored in fun'args (low) and fun'args+1 (high).
 ;               The index is one byte, so at most 256 bytes are
 ;               printed; an unterminated string no longer loops
 ;               forever.
 ; Modifies: .A and .Y
 print'string:
        ldy     #$00
 *       lda     (fun'args), y
        beq     +               ; Terminator reached
        jsr     chrout
        iny
        bne     -               ; Falls through when Y wraps to zero
 *       rts
 </programlisting>
  </section>
 </appendix>
--- a/doc/tutor1.sgm
+++ b/doc/tutor1.sgm
@ -256,6 +256,31 @@ hello:  .byte "HELLO, WORLD!", 0
       summary of available command line options.
    </para>
    <para>
      Ophis takes a list of source files and produces an output file
      based on assembling each file you give it, in order. You can add
      a line to your program like this to name the output file:
    </para>
 <programlisting>
 .outfile "hello.prg"
 </programlisting>
    <para>
      Alternately, you can use the <option>-o</option> option on the
      command line. This will override any <literal>.outfile</literal>
      directives. If you don't specify any name, it will put the
      output into a file named <filename>ophis.bin</filename>.
    </para>
    <para>
      If you are using Ophis as part of some larger toolchain, you can
      also make it run in <emphasis>pipe mode</emphasis>. If you give
      a dash <option>-</option> as an input file or as the output
      target, Ophis will use standard input or output for
      communication.
    </para>
    <table frame="all">
      <title>Ophis Options</title>
      <tgroup cols='2'>
@ -266,13 +291,11 @@ hello:  .byte "HELLO, WORLD!", 0
          </row>
        </thead>
        <tbody>
-          <row><entry><option>-6510</option></entry><entry>Allows the 6510 undocumented opcodes as listed in the VICE documentation.</entry></row>
+          <row><entry><option>-o FILE</option></entry><entry>Overrides the default filename for output.</entry></row>
-          <row><entry><option>-65c02</option></entry><entry>Allows opcodes and addressing modes added by the 65C02.</entry></row>
+          <row><entry><option>-u</option></entry><entry>Allows the 6510 undocumented opcodes as listed in the VICE documentation.</entry></row>
-          <row><entry><option>-v 0</option></entry><entry>Quiet operation.  Only reports errors.</entry></row>
+          <row><entry><option>-c</option></entry><entry>Allows opcodes and addressing modes added by the 65C02.</entry></row>
-          <row><entry><option>-v 1</option></entry><entry>Default operation.  Reports files as they are loaded, and gives statistics on the final output.</entry></row>
+          <row><entry><option>-q</option></entry><entry>Quiet operation.  Only reports warnings and errors.</entry></row>
-          <row><entry><option>-v 2</option></entry><entry>Verbose operation.  Names each assembler pass as it runs.</entry></row>
+          <row><entry><option>-v</option></entry><entry>Verbose operation.  Reports files as they are loaded.</entry></row>
          <row><entry><option>-v 3</option></entry><entry>Debug operation:  Dumps the entire IR after each pass.</entry></row>
          <row><entry><option>-v 4</option></entry><entry>Full debug operation:  Dumps the entire IR and symbol table after each pass.</entry></row>
        </tbody>
      </tgroup>
    </table>
@ -283,30 +306,16 @@ hello:  .byte "HELLO, WORLD!", 0
      here:
    </para>
    <screen>
-localhost$ ophis tutor1.oph tutor1.prg -v 2
+localhost$ ophis -v hello1.oph
-Loading tutor1.oph
+Loading hello1.oph
 Running: Macro definition pass
 Running: Macro expansion pass
 Running: Label initialization pass
 Fixpoint failed, looping back
 Running: Label initialization pass
 Running: Circularity check pass
 Running: Expression checking pass
 Running: Easy addressing modes pass
 Running: Label Update Pass
 Fixpoint failed, looping back
 Running: Label Update Pass
 Running: Instruction Collapse Pass
 Running: Mode Normalization pass
 Running: Label Update Pass
 Running: Assembler
 Assembly complete: 45 bytes output (14 code, 29 data, 2 filler)
    </screen>
    <para>
-      If your emulator can run <filename>PRG</filename> files
+      This will produce a file named <filename>hello.prg</filename>. If
-      directly, this file will now run (and
+      your emulator can run <filename>PRG</filename> files directly,
-      print <computeroutput>HELLO, WORLD!</computeroutput>) as many
+      this file will now run (and print <computeroutput>HELLO,
-      times as you type <userinput>RUN</userinput>.  Otherwise, use
+      WORLD!</computeroutput>) as many times as you
      type <userinput>RUN</userinput>.  Otherwise, use
      a <filename>D64</filename> management utility to put
      the <filename>PRG</filename> on a <filename>D64</filename>, then
      load and run the file off that.
--- a/doc/tutor3.sgm
+++ b/doc/tutor3.sgm
@ -53,7 +53,10 @@
      the KERNAL values are standard, we do not reproduce them here.
      (The files in question are <xref linkend="c64-1-src"
      endterm="c64-1-fname"> and <xref linkend="kernal-src"
-      endterm="kernal-fname">.)
+      endterm="kernal-fname">.) The <filename>c64kernal.oph</filename>
      header is likely to be useful in your own projects, and it is
      available in the <literal>platform/</literal> directory for easy
      inclusion.
    </para>
  </section>
  <section>
--- a/doc/tutor4.sgm
+++ b/doc/tutor4.sgm
@ -64,11 +64,11 @@ target10: .byte "Universe", 0
    and lowercase are reversed, so we have messages
    like <computeroutput>hELLO, sOLAR sYSTEM!</computeroutput>.  For
    the specific case of PETSCII, we can just fix our strings, but
-    that's less of an option if we're writing for the Apple II's
+    that's less of an option if we're writing for a game console that
-    character set, or targeting a game console that puts its letters
+    puts its letters in arbitrary locations.  We need to remap how
-    in arbitrary locations.  We need to remap how strings are turned
+    strings are turned into byte values.
-    into byte values.  The <literal>.charmap</literal>
+    The <literal>.charmap</literal> and <literal>.charmapbin</literal>
-    and <literal>.charmapbin</literal> directives do what we need.
+    directives do what we need.
  </para>
  <para>
@ -102,9 +102,6 @@ target10: .byte "Universe", 0
    specifies an external file, 256 bytes long, that is loaded in at
    that point.  A binary character map for the Commodore 64 is
    provided with the sample programs
-    as <filename>petscii.map</filename>. There are also three
+    as <filename>petscii.map</filename>.
    files, <filename>a2normal.map</filename>, <filename>a2inverse.map</filename>,
    and <filename>a2blink.map</filename> that handle the Apple II's
    very nonstandard character encodings.
  </para>
 </chapter>
--- a/doc/tutor7.sgm
+++ b/doc/tutor7.sgm
@ -48,27 +48,46 @@
      locations $02-$7F are used by the BASIC interpreter, and
      locations $80-$FF are used by the KERNAL.  We don't need the
      BASIC interpreter, though, so we can back up all of $02-$7F at
-      the start of our program and restore it all when we're done:
+      the start of our program and restore it all when we're done.
    </para>
    <para>
       In fact, since we're disabling BASIC, we can actually also swap
      out its ROM entirely and get a contiguous block of RAM from
      $0002 to $CFFF:
    </para>
 <programlisting>
 .scope
-        ; Cache BASIC's zero page at top of available RAM.
+        ; Cache BASIC zero page at top of available RAM
-        ldx #$7E
+        ldx     #$7E
-*       lda $01, x
+*       lda     $01, x
-        sta $CF81, x
+        sta     $CF81, x
        dex
-        bne -
+        bne     -
-        jsr _main
+        ; Swap out the BASIC ROM for RAM
        lda     $01
        and     #$fe
        ora     #$06
        sta     $01
-        ; Restore BASIC's zero page and return control.
+        ; Run the real program
        jsr     _main
-        ldx #$7E
+        ; Restore BASIC ROM
-*       lda $CF81, x
+        lda     $01
-        sta $01, x
+        ora     #$07
        sta     $01
        ; Restore BASIC zero page
        ldx     #$7E
 *       lda     $CF81, x
        sta     $01, x
        dex
-        bne -
+        bne     -
        ; Back to BASIC
        rts
 _main:
@ -79,7 +98,9 @@ _main:
    <para>
      The new, improved header file is <xref linkend="c64-2-src"
-      endterm="c64-2-fname">.
+      endterm="c64-2-fname">. This,
      like <filename>c64kernal.oph</filename>, is available for use in
      your own projects in the <literal>platform/</literal> directory.
    </para>
    <para>
--- a/examples/fibonacci.oph
+++ b/examples/fibonacci.oph
@ -0,0 +1,213 @@
 .include "../platform/c64_0.oph"
 .require "../platform/c64kernal.oph"
 .outfile "fibonacci.prg"
 	jsr	init'stack    ; Give the software stack a known start
 	                      ; before fib's first save'stack.
 	lda	#<opening     ; Print opening text
 	sta	fun'args
 	lda	#>opening
 	sta	fun'args+1
 	jsr	print'string
 	lda	#$00
 	sta	fun'vars      ; Count num from 0 to 19
 *	lda	fun'vars      ; Main loop: print num, with leading space if <10
 	cmp	#$09
 	bcs	+
 	lda	#$20
 	jsr	chrout
 	lda	fun'vars
 *	sta	fun'args      ; Copy num to args, print it, plus ": "
 	inc	fun'args      ; (the printed index is num+1)
 	lda	#$00
 	sta	fun'args+1
 	jsr	print'dec
 	lda	#$3A
 	jsr	chrout
 	lda	#$20
 	jsr	chrout
 	lda	fun'vars      ; Copy num to args, call fib, print result
 	sta	fun'args
 	jsr	fib
 	jsr	print'dec
 	lda	#$0D          ; Newline
 	jsr	chrout
 	inc	fun'vars      ; Increment num; if it's 20, we're done.
 	lda	fun'vars
 	cmp	#20
 	bne	--            ; Otherwise, loop.
 	rts
 opening:
 .byte	147, "           FIBONACCI SEQUENCE",13,13,0
 .scope
 ; Uint16 fib (Uint8 x): compute Xth Fibonacci number.
 ; fib(0) = fib(1) = 1.
 ; Arguments:  x in fun'args.
 ; Returns:    16-bit result in fun'args (low) and fun'args+1 (high).
 ; Stack usage: 3.
 ; save'stack preserves the caller's first three fun'vars bytes and
 ; copies the arguments over them, so inside this routine fun'vars
 ; holds x and fun'vars+1/+2 hold the result of fib(x-1).
 fib:	lda	#$03
 	jsr	save'stack
 	lda	fun'vars    ; If x < 2, goto _base.
 	cmp	#$02
 	bcc	_base
 	dec	fun'args    ; Otherwise, call fib(x-1)...
 	jsr	fib
 	lda	fun'args    ; Copy the result to local variable...
 	sta	fun'vars+1
 	lda	fun'args+1
 	sta	fun'vars+2
 	lda	fun'vars    ; Call fib(x-2)...
 	sec
 	sbc	#$02
 	sta	fun'args
 	jsr	fib
 	clc                 ; And add the old result to it, leaving it
 	lda	fun'args    ; in the 'result' location.
 	adc	fun'vars+1
 	sta	fun'args
 	lda	fun'args+1
 	adc	fun'vars+2
 	sta	fun'args+1
 	jmp	_done       ; and then we're done.
 _base:	ldy	#$01        ; In the base case, just copy 1 to the
 	sty	fun'args    ; result.
 	dey
 	sty	fun'args+1
 _done:	lda	#$03
 	jsr	restore'stack
 	rts
 .scend
 .scope
 ; Software stack used to preserve function locals across calls.
 ;   init'stack:    resets the stack pointer to $A000.
 ;   save'stack:    A = byte count n (must be non-zero).  Moves the
 ;                  stack pointer down by n, saves fun'vars[0..n-1]
 ;                  there, then copies fun'args[0..n-1] into fun'vars.
 ;   restore'stack: A = byte count n (must be non-zero).  Reloads
 ;                  fun'vars[0..n-1] from the stack and moves the
 ;                  stack pointer back up by n.
 .data zp
 .space _sp	$02
 .space _counter	$01
 .space fun'args	$10
 .space fun'vars	$40
 .text
 init'stack:
 	lda	#$00		; _sp = $A000
 	sta	_sp
 	lda	#$A0
 	sta	_sp+1
 	rts
 save'stack:
 	sta	_counter	; Bytes still to copy
 	sec			; _sp = _sp - n (16-bit)
 	lda	_sp
 	sbc	_counter
 	sta	_sp
 	lda	_sp+1
 	sbc	#$00
 	sta	_sp+1
 	ldy	#$00
 _push:	lda	fun'vars, y	; The old local goes to the stack...
 	sta	(_sp), y
 	lda	fun'args, y	; ...and the argument takes its place.
 	sta	fun'vars, y
 	iny
 	dec	_counter
 	bne	_push
 	rts
 restore'stack:
 	pha			; Keep n for the pointer fix-up below.
 	sta	_counter
 	ldy	#$00
 _pop:	lda	(_sp), y
 	sta	fun'vars, y
 	iny
 	dec	_counter
 	bne	_pop
 	pla
 	clc			; _sp = _sp + n (16-bit)
 	adc	_sp
 	sta	_sp
 	lda	_sp+1
 	adc	#$00
 	sta	_sp+1
 	rts
 .scend
 ; Utility functions.  print'dec prints the unsigned 16-bit integer in
 ; fun'args/fun'args+1 as decimal without leading zeroes.  There is
 ; no division routine: each digit is found by repeatedly subtracting
 ; a power of ten (repsub'16).  The value and the four bytes after it
 ; in fun'args are overwritten.  print'string, further down, prints a
 ; zero-terminated string.
 .scope
 .data
 .org	fun'args			; Scratch overlays fun'args
 	.space	_val		2	; Value still to be printed
 	.space	_step		2	; Power of ten being counted
 	.space	_res		1	; Digit found so far
 	.space	_allowzero	1	; Non-zero once a digit is out
 .text
 print'dec:
 	lda	#$00		; Nothing printed yet, so zeroes are
 	sta	_allowzero	; leading zeroes for now.
 	lda	#<10000		; Ten-thousands
 	sta	_step
 	lda	#>10000
 	sta	_step+1
 	jsr	repsub'16
 	lda	#<1000		; Thousands
 	sta	_step
 	lda	#>1000
 	sta	_step+1
 	jsr	repsub'16
 	lda	#0		; Hundreds
 	sta	_step+1
 	lda	#100
 	sta	_step
 	jsr	repsub'16
 	lda	#10		; Tens (high byte is still zero)
 	sta	_step
 	jsr	repsub'16
 	lda	_val		; The remainder is the units digit,
 	jsr	_print		; which is always printed.
 	rts
 ; repsub'16: counts how often _step can be taken from _val, leaves
 ; the remainder in _val, and prints the count as a digit unless it
 ; would be a leading zero.
 repsub'16:
 	lda	#$00
 	sta	_res
 _try:	lda	_val		; Trial subtraction: does _step fit?
 	sec
 	sbc	_step
 	lda	_val+1
 	sbc	_step+1
 	bcc	_done		; Borrow: it no longer fits.
 	lda	_val		; It fits, so subtract for real.
 	sec
 	sbc	_step
 	sta	_val
 	lda	_val+1
 	sbc	_step+1
 	sta	_val+1
 	inc	_res
 	jmp	_try
 _done:	lda	_res
 	ora	_allowzero	; Zero digit with nothing printed yet?
 	beq	_ret		; Then suppress it.
 	sta	_allowzero	; From now on zeroes are significant.
 	lda	_res
 _print:	clc
 	adc	#'0		; Digit to character
 	jsr	chrout
 _ret:	rts
 .scend
 ; print'string: prints the zero-terminated string whose address is
 ;               stored in fun'args (low) and fun'args+1 (high).
 ;               The index is one byte, so at most 256 bytes are
 ;               printed; an unterminated string no longer loops
 ;               forever.
 ; Modifies: .A and .Y
 print'string:
 	ldy	#$00
 *	lda	(fun'args), y
 	beq	+		; Terminator reached
 	jsr	chrout
 	iny
 	bne	-		; Falls through when Y wraps to zero
 *	rts
--- a/examples/hello1.oph
+++ b/examples/hello1.oph
@ -1,5 +1,6 @@
 .word $0801
 .org  $0801
 .outfile "hello.prg"
 	.word next, 10		; Next line and current line number
 	.byte $9e," 2064",0	; SYS 2064
--- a/examples/hello2.oph
+++ b/examples/hello2.oph
@ -1,5 +1,6 @@
 .word $0801
 .org  $0801
 .outfile "hello.prg"
 .scope
 	.word _next, 10		; Next line and current line number
--- a/examples/hello3.oph
+++ b/examples/hello3.oph
@ -1,4 +1,5 @@
 .include "c64-1.oph"
 .outfile "hello.prg"
 .macro print
 	ldx #0
--- a/examples/hello4a.oph
+++ b/examples/hello4a.oph
@ -1,4 +1,5 @@
 .include "c64-1.oph"
 .outfile "hello.prg"
 .macro print
 	ldx #0
--- a/examples/hello4b.oph
+++ b/examples/hello4b.oph
@ -1,4 +1,5 @@
 .include "c64-1.oph"
 .outfile "hello.prg"
 .macro print
 	ldx #0
--- a/examples/hello4c.oph
+++ b/examples/hello4c.oph
@ -1,4 +1,5 @@
 .include "c64-1.oph"
 .outfile "hello.prg"
 .macro print
 	ldx #0
--- a/examples/hello5.oph
+++ b/examples/hello5.oph
@ -1,4 +1,5 @@
 .include "c64-1.oph"
 .outfile "hello.prg"
 .data
 .org $C000
--- a/examples/hello6.oph
+++ b/examples/hello6.oph
@ -1,4 +1,5 @@
 .include "c64-1.oph"
 .outfile "hello.prg"
 .data
 .org $C000
--- a/examples/structuredemo.oph
+++ b/examples/structuredemo.oph
@ -0,0 +1,232 @@
 ; structuredemo: prints a 16-entry table in array order, links the
 ; entries into an ascending list, then prints them in list order.
 ; NOTE(review): c64_0.oph presumably supplies the program header and
 ; c64kernal.oph the chrout label - confirm in ../platform.
 .include "../platform/c64_0.oph"
 .require "../platform/c64kernal.oph"
 .outfile "structuredemo.prg"
 ; Entry sequence: show the raw data, sort, show the result, return.
 	jsr print'unsorted
 	jsr insertion'sort
 	jsr print'list
 	rts
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Linked list data: head, next, lb, hb.
 ; lb/hb: Low/high bytes of the data array.  These are immutable and
 ;        kept with the program text.
 ; head:  Array index of the first element in the list, or #$FF if the
 ;        list is empty
 ; next:  Array of successor indices.  If you've just read element X,
 ;        the value of memory location next+X is the index of the
 ;        next element.  If that value is #$FF, you've reached the
 ;        end of the list.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Mutable list state is reserved in the data segment from $C000:
 ; one byte for head, and one next entry per element (16 in total).
 .data
 .org	$C000
 .space	head	1
 .space	next	16
 .text
 ; The 16 values are stored split: lb holds each value's low byte and
 ; hb the matching high byte, so element i is hb[i]:lb[i].
 lb:   .byte <$838,<$618,<$205,<$984,<$724,<$301,<$249,<$946
      .byte <$925,<$043,<$114,<$697,<$985,<$633,<$312,<$086
 hb:   .byte >$838,>$618,>$205,>$984,>$724,>$301,>$249,>$946
      .byte >$925,>$043,>$114,>$697,>$985,>$633,>$312,>$086
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; insertion'sort:  Rebuilds the linked list (head, next) in ascending
 ;                  order: the list is emptied, then array elements
 ;                  0 through 15 are handed to insert_elt one by one.
 ; Arguments:  None.
 ; Modifies:   All registers destroyed, head and next array rewritten.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .scope
 insertion'sort:
 	lda #$FF		; #$FF is the end-of-list marker, so
 	sta head		; storing it in head empties the list.
 	ldx #$00		; X = index of the element to insert next
 _each:
 	txa			; insert_elt destroys X, so keep the
 	pha			; index on the stack across the call.
 	jsr insert_elt
 	pla
 	tax
 	inx
 	cpx #$10		; all 16 elements done?
 	bne _each
 	rts
 .scend
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; insert_elt: Insert an element into the linked list.  Maintains the
 ;             list in sorted, ascending order.  Used by
 ;             insertion'sort.  Values are compared as unsigned
 ;             16-bit numbers: each comparison is a 16-bit subtract
 ;             whose final borrow (carry clear) means "list value is
 ;             below the new value".  Branching on the borrow rather
 ;             than on the sign of the high byte keeps the ordering
 ;             right even when two values differ by $8000 or more.
 ;             An element equal to an existing one is placed in
 ;             front of it.
 ; Arguments:  X register holds the index of the element to add.
 ; Modifies:   All registers destroyed; head and next arrays updated
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .data
 .space lbtoinsert 1
 .space hbtoinsert 1
 .space indextoinsert 1
 .text
 insert_elt:
 	ldy head			; If the list is empty, make
 	cpy #$FF			; head point at it, and return.
 	bne insert_elt'list'not'empty
 	stx head
 	tya				; Y is still #$FF here, so the new
 	sta next,x			; element becomes the end of the list.
 	rts
 insert_elt'list'not'empty:
 	lda lb,x			; Cache the data we're inserting
 	sta lbtoinsert
 	lda hb,x
 	sta hbtoinsert
 	stx indextoinsert
 	ldy head			; Compute data[head] - arg.  Only
 	sec				; the final carry matters, so the
 	lda lb,y			; low-byte result is discarded.
 	sbc lbtoinsert
 	lda hb,y
 	sbc hbtoinsert
 	bcc insert_elt'not'smallest	; borrow: data[head] < arg
 	tya				; arg <= data[head]: set its next
 	sta next,x			; pointer to the old head, update
 	stx head			; the head pointer, and return.
 	rts
 insert_elt'not'smallest:
 	ldx head
 insert_elt'loop:			; At this point, we know that
 	lda next,x			; argument > data[X].
 	tay
 	cpy #$FF			; if next[X] = #$FF, insert arg at end.
 	beq insert_elt'insert'after'current
 	lda lb,y			; Otherwise, compute
 	sec				; data[next[X]] - arg.  A borrow
 	sbc lbtoinsert			; means arg belongs further down
 	lda hb,y			; the list.
 	sbc hbtoinsert
 	bcc insert_elt'goto'next	; borrow: data[next[X]] < arg
 insert_elt'insert'after'current:	; Fix up all the next links:
 	tya				; next[arg] = old next[X]
 	ldy indextoinsert
 	sta next,y
 	tya				; next[X] = arg
 	sta next,x
 	rts				; and return.
 insert_elt'goto'next:			; Otherwise, let X = next[X]
 	tya				; and go looping again.
 	tax
 	jmp insert_elt'loop
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; print'unsorted: Writes the unsorted'hdr string, then walks the
 ;                 lb/hb arrays from index 0 to 15, printing every
 ;                 value as four hex digits (high byte first)
 ;                 followed by a space, and ends with a #$0D.
 ; Standalone procedure.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .scope
 print'unsorted:
 	lda #<unsorted'hdr		; A/X = address of the header text
 	ldx #>unsorted'hdr
 	jsr put'string
 	ldy #$00			; Y = array index
 _next:
 	lda hb,y
 	jsr print'hex
 	lda lb,y
 	jsr print'hex
 	lda #$20			; separator between values
 	jsr chrout
 	iny
 	cpy #$10
 	bne _next
 	lda #$0D			; finish the line
 	jsr chrout
 	rts
 .scend
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; print'list: Writes the sorted'hdr string, then follows the next
 ;             links from head until the #$FF terminator, printing
 ;             each visited value as four hex digits (high byte
 ;             first) followed by a space.  Ends with a #$0D.
 ; Standalone procedure.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .scope
 print'list:
 	lda #<sorted'hdr		; A/X = address of the header text
 	ldx #>sorted'hdr
 	jsr put'string
 	ldy head			; Y = index of the current element
 _walk:
 	cpy #$FF			; terminator: nothing left to print
 	beq _finish
 	lda hb,y
 	jsr print'hex
 	lda lb,y
 	jsr print'hex
 	lda #$20			; separator between values
 	jsr chrout
 	lda next,y			; step to the successor
 	tay
 	jmp _walk
 _finish:
 	lda #$0D			; finish the line
 	jsr chrout
 	rts
 .scend
 ;; String data for the above routines.  Both strings end in a zero
 ;; byte, which is the terminator put'string stops on.
 unsorted'hdr:
 	.byte 147		; Clear screen first!
 	.byte "UNSORTED DATA:",13,0
 sorted'hdr:
 	.byte "SORTED DATA:",13,0
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; print'hex: outputs a two-character hex representation of a one-
 ;            byte value, high nibble first.  Each nibble is used as
 ;            an index into hexstr and the character found there is
 ;            passed to chrout.
 ; Arguments: Byte to print in accumulator
 ; Modifies: .A and .X
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 print'hex:
 	pha			; keep the byte for the low nibble
 	lsr			; LSR shifts a zero into bit 7, so no
 	lsr			; carry setup is needed before moving
 	lsr			; the high nibble down to bits 0-3.
 	lsr
 	tax
 	lda hexstr,x
 	jsr chrout
 	pla
 	and #$0F		; isolate the low nibble
 	tax
 	lda hexstr,x
 	jsr chrout
 	rts
 ; Character data array for print'hex.
 hexstr: .byte "0123456789ABCDEF"
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; put'string: sends a zero-terminated string to chrout one byte at
 ;             a time.  The index register wraps after 256 bytes, and
 ;             when that happens without a terminator having been
 ;             seen the routine simply returns.
 ; Arguments: Low byte of string address in .A, high byte in .X
 ; Modifies: .A and .Y
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 .scope
 .data zp
 .space put'string'addr 2
 .text
 put'string:
 	sta put'string'addr		; build the pointer used by the
 	stx put'string'addr+1		; (addr),y loads below
 	ldy #$00
 _emit:
 	lda (put'string'addr),y
 	beq _exit			; terminator reached
 	jsr chrout
 	iny
 	bne _emit			; falls through once Y wraps to 0
 _exit:
 	rts
 .scend